IMP: add protein types and formats

qiime2 · Oct 12, 2020 · b1e026a · b1e026a
1 parent af9e2be
commit b1e026a
Show file tree

Hide file tree

Showing 10 changed files with 509 additions and 15 deletions.
diff --git a/q2_types/feature_data/__init__.py b/q2_types/feature_data/__init__.py
@@ -14,13 +14,17 @@
     TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
     PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
     AlignedDNASequencesDirectoryFormat, DifferentialFormat,
-    DifferentialDirectoryFormat)
+    DifferentialDirectoryFormat, AlignedProteinSequencesDirectoryFormat,
+    ProteinSequencesDirectoryFormat, ProteinFASTAFormat,
+    AlignedProteinFASTAFormat)
 from ._type import (
     FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
-    Differential)
+    Differential, ProteinSequence, AlignedProteinSequence)
 
 # TODO remove these imports when tests are rewritten. Remove from __all__ too
-from ._transformer import DNAIterator, PairedDNAIterator, AlignedDNAIterator
+from ._transformer import (
+    DNAIterator, PairedDNAIterator, AlignedDNAIterator,
+    ProteinIterator, AlignedProteinIterator)
 
 __all__ = [
     'TaxonomyFormat', 'TaxonomyDirectoryFormat', 'HeaderlessTSVTaxonomyFormat',
@@ -30,6 +34,10 @@
     'AlignedDNAFASTAFormat', 'AlignedDNASequencesDirectoryFormat',
     'FeatureData', 'Taxonomy', 'Sequence', 'PairedEndSequence',
     'AlignedSequence', 'DNAIterator', 'PairedDNAIterator',
-    'AlignedDNAIterator', 'Differential', 'DifferentialDirectoryFormat']
+    'AlignedDNAIterator', 'Differential', 'DifferentialDirectoryFormat',
+    'ProteinFASTAFormat', 'AlignedProteinFASTAFormat',
+    'ProteinSequence', 'ProteinSequencesDirectoryFormat',
+    'AlignedProteinSequence', 'AlignedProteinSequencesDirectoryFormat',
+    'ProteinIterator', 'AlignedProteinIterator']
 
 importlib.import_module('q2_types.feature_data._transformer')
diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
@@ -5,10 +5,11 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
+import itertools
 import re
 
 import qiime2.plugin.model as model
+import skbio
 from qiime2.plugin import ValidationError
 import qiime2
 
@@ -299,11 +300,60 @@ def validate(self, *args):
     'DifferentialDirectoryFormat', 'differentials.tsv', DifferentialFormat)
 
 
+class ProteinFASTAFormat(model.TextFileFormat):
+    """FASTA text file validated by parsing its records with scikit-bio
+    as ``skbio.Protein`` sequences.
+    """
+
+    def _validate_(self, level):
+        # Number of leading records to parse per validation level:
+        # 'min' checks the first 5, 'max' (None) checks every record.
+        record_count_map = {'min': 5, 'max': None}
+        self._validate(record_count_map[level])
+
+    def _validate(self, n_records=None):
+        """Parse up to ``n_records`` records (all of them when ``None``).
+
+        Parsing is the validation: skbio raises ``ValueError`` for records
+        it cannot read as protein sequences.
+        """
+        generator = self._read_protein_fasta(str(self))
+        if n_records is not None:
+            generator = itertools.islice(generator, n_records)
+        # Drain the reader without keeping the parsed records; collecting
+        # them in a list would hold every sequence in memory at level 'max'.
+        for _ in generator:
+            pass
+
+    def _read_protein_fasta(self, path):
+        """Return skbio's FASTA reader for ``path`` using ``skbio.Protein``."""
+        return skbio.read(path, format='fasta', constructor=skbio.Protein)
+
+
+# Directory format holding a single file, 'protein-sequences.fasta', which
+# is handled as a ProteinFASTAFormat.
+ProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
+    'ProteinSequencesDirectoryFormat',
+    'protein-sequences.fasta',
+    ProteinFASTAFormat)
+
+
+class AlignedProteinFASTAFormat(model.TextFileFormat):
+    """FASTA text file validated by loading it with scikit-bio as a
+    ``skbio.TabularMSA`` of ``skbio.Protein`` sequences.
+    """
+
+    def _validate_(self, level):
+        # Number of leading records to iterate per validation level:
+        # 'min' checks the first 5, 'max' (None) checks every record.
+        record_count_map = {'min': 5, 'max': None}
+        self._validate(record_count_map[level])
+
+    def _validate(self, n_records=None):
+        """Load the alignment and iterate up to ``n_records`` of its rows.
+
+        Reading is the validation: skbio raises ``ValueError`` for records
+        it cannot read.
+        """
+        # NOTE(review): with ``into=skbio.TabularMSA`` the whole file is
+        # presumably parsed before this call returns, so ``n_records`` would
+        # only bound the loop below, not the amount of parsing - confirm.
+        generator = self._read_protein_alignment_fasta(str(self))
+        if n_records is not None:
+            generator = itertools.islice(generator, n_records)
+        # Iterate without building a throwaway list of the sequences.
+        for _ in generator:
+            pass
+
+    def _read_protein_alignment_fasta(self, path):
+        """Read ``path`` as FASTA into a ``TabularMSA`` of ``skbio.Protein``."""
+        return skbio.read(path, format='fasta',
+                          constructor=skbio.Protein, into=skbio.TabularMSA)
+
+
+# Directory format holding a single file, 'aligned-protein-sequences.fasta',
+# which is handled as an AlignedProteinFASTAFormat.
+AlignedProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
+    'AlignedProteinSequencesDirectoryFormat',
+    'aligned-protein-sequences.fasta',
+    AlignedProteinFASTAFormat)
+
+
 plugin.register_formats(
     TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat,
     HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat,
     TaxonomyFormat, TaxonomyDirectoryFormat, DNAFASTAFormat,
     DNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
     AlignedDNAFASTAFormat, AlignedDNASequencesDirectoryFormat,
-    DifferentialFormat, DifferentialDirectoryFormat
+    DifferentialFormat, DifferentialDirectoryFormat, ProteinFASTAFormat,
+    AlignedProteinFASTAFormat, ProteinSequencesDirectoryFormat,
+    AlignedProteinSequencesDirectoryFormat,
 )
diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py
@@ -18,7 +18,8 @@
 from ..feature_table import BIOMV210Format
 from . import (TaxonomyFormat, HeaderlessTSVTaxonomyFormat, TSVTaxonomyFormat,
                DNAFASTAFormat, PairedDNASequencesDirectoryFormat,
-               AlignedDNAFASTAFormat, DifferentialFormat)
+               AlignedDNAFASTAFormat, DifferentialFormat, ProteinFASTAFormat,
+               AlignedProteinFASTAFormat)
 
 
 # Taxonomy format transformers
@@ -262,10 +263,14 @@ def _dnafastaformats_to_metadata(ff):
     return qiime2.Metadata(df)
 
 
-def _series_to_fasta_format(ff, data):
+def _series_to_fasta_format(ff, data, sequence_type="DNA"):
+    """Write every (id, sequence) entry of ``data`` to ``ff`` as FASTA.
+
+    ``sequence_type="protein"`` builds ``skbio.Protein`` records; any other
+    value (default ``"DNA"``) builds ``skbio.DNA`` records.
+    """
+    # Resolve the record type once instead of re-testing it for every row.
+    constructor = skbio.Protein if sequence_type == "protein" else skbio.DNA
     with ff.open() as f:
         for id_, seq in data.iteritems():
-            sequence = skbio.DNA(seq, metadata={'id': id_})
+            sequence = constructor(seq, metadata={'id': id_})
             skbio.io.write(sequence, format='fasta', into=f)
 
 
@@ -402,3 +407,119 @@ def _224(data: pd.DataFrame) -> DifferentialFormat:
     ff = DifferentialFormat()
     qiime2.Metadata(data).save(str(ff))
     return ff
+
+
+# Protein FASTA transformers
+
+class ProteinIterator(collections.abc.Iterable):
+    """Iterable wrapper around a source of protein sequence records.
+
+    The transformers in this module use this class as a view type for
+    ProteinFASTAFormat.
+    """
+
+    def __init__(self, generator):
+        # Stored as-is; nothing is consumed until iteration starts.
+        self.generator = generator
+
+    def __iter__(self):
+        # Delegates to the wrapped object. If that object is a one-shot
+        # generator, this wrapper can only be iterated once.
+        yield from self.generator
+
+
+class AlignedProteinIterator(ProteinIterator):
+    # Adds no behaviour of its own; it is a distinct class so transformers
+    # can be registered for AlignedProteinFASTAFormat separately.
+    pass
+
+
+def _read_protein_fasta(path):
+    """Read the FASTA file at ``path`` with skbio, using ``skbio.Protein``
+    as the constructor for each record.
+    """
+    return skbio.read(path, format='fasta', constructor=skbio.Protein)
+
+
+def _proteinfastaformats_to_series(ff):
+    """Return a ``pd.Series`` of the protein records in ``ff`` keyed by ID.
+
+    Raises ``ValueError`` as soon as a sequence ID is seen a second time.
+    """
+    records_by_id = {}
+    for record in _read_protein_fasta(str(ff)):
+        record_id = record.metadata['id']
+        if record_id not in records_by_id:
+            records_by_id[record_id] = record
+            continue
+        raise ValueError("FASTA format sequence IDs must be unique. The "
+                         "following ID was found more than once: %s."
+                         % record_id)
+    return pd.Series(records_by_id)
+
+
+def _proteinfastaformats_to_metadata(ff):
+    """Return ``qiime2.Metadata`` with the sequences of ``ff`` as strings in
+    a 'Sequence' column indexed by 'Feature ID'.
+    """
+    frame = _proteinfastaformats_to_series(ff).to_frame().astype(str)
+    frame.index.name = 'Feature ID'
+    frame.columns = ['Sequence']
+    return qiime2.Metadata(frame)
+
+
+@plugin.register_transformer
+def _37(ff: ProteinFASTAFormat) -> ProteinIterator:
+    """Expose the records of a protein FASTA file as a ProteinIterator."""
+    return ProteinIterator(_read_protein_fasta(str(ff)))
+
+
+@plugin.register_transformer
+def _38(data: ProteinIterator) -> ProteinFASTAFormat:
+    """Write the sequences yielded by ``data`` to a new protein FASTA file."""
+    fasta = ProteinFASTAFormat()
+    skbio.io.write(iter(data), into=str(fasta), format='fasta')
+    return fasta
+
+
+@plugin.register_transformer
+def _39(ff: AlignedProteinFASTAFormat) -> skbio.TabularMSA:
+    """Load an aligned protein FASTA file as a TabularMSA of skbio.Protein."""
+    msa = skbio.TabularMSA.read(
+        str(ff), format='fasta', constructor=skbio.Protein)
+    return msa
+
+
+@plugin.register_transformer
+def _40(data: skbio.TabularMSA) -> AlignedProteinFASTAFormat:
+    """Write a TabularMSA to a new aligned protein FASTA file.
+
+    The sequence type held by ``data`` is not inspected here.
+    """
+    fasta = AlignedProteinFASTAFormat()
+    data.write(str(fasta), format='fasta')
+    return fasta
+
+
+@plugin.register_transformer
+def _41(ff: ProteinFASTAFormat) -> pd.Series:
+    """Protein FASTA file -> pd.Series of records keyed by sequence ID."""
+    return _proteinfastaformats_to_series(ff)
+
+
+@plugin.register_transformer
+def _42(ff: ProteinFASTAFormat) -> qiime2.Metadata:
+    """Protein FASTA file -> qiime2.Metadata with a 'Sequence' column."""
+    return _proteinfastaformats_to_metadata(ff)
+
+
+@plugin.register_transformer
+def _43(data: pd.Series) -> ProteinFASTAFormat:
+    """Write a pd.Series of sequences to a new protein FASTA file."""
+    fasta = ProteinFASTAFormat()
+    _series_to_fasta_format(fasta, data, "protein")
+    return fasta
+
+
+@plugin.register_transformer
+def _44(ff: AlignedProteinFASTAFormat) -> AlignedProteinIterator:
+    """Expose an aligned protein FASTA file as an AlignedProteinIterator.
+
+    Records come from the plain protein FASTA reader; no TabularMSA is
+    built in this transformer.
+    """
+    return AlignedProteinIterator(_read_protein_fasta(str(ff)))
+
+
+@plugin.register_transformer
+def _45(data: AlignedProteinIterator) -> AlignedProteinFASTAFormat:
+    """Write the sequences yielded by ``data`` to a new aligned protein
+    FASTA file.
+    """
+    fasta = AlignedProteinFASTAFormat()
+    skbio.io.write(iter(data), into=str(fasta), format='fasta')
+    return fasta
+
+
+@plugin.register_transformer
+def _46(ff: AlignedProteinFASTAFormat) -> qiime2.Metadata:
+    """Aligned protein FASTA file -> qiime2.Metadata ('Sequence' column)."""
+    return _proteinfastaformats_to_metadata(ff)
+
+
+@plugin.register_transformer
+def _47(ff: AlignedProteinFASTAFormat) -> pd.Series:
+    """Aligned protein FASTA file -> pd.Series of records keyed by ID."""
+    return _proteinfastaformats_to_series(ff)
+
+
+@plugin.register_transformer
+def _48(data: pd.Series) -> AlignedProteinFASTAFormat:
+    """Write a pd.Series of sequences to a new aligned protein FASTA file."""
+    fasta = AlignedProteinFASTAFormat()
+    _series_to_fasta_format(fasta, data, "protein")
+    return fasta
+
+
+@plugin.register_transformer
+def _49(fmt: AlignedProteinFASTAFormat) -> ProteinIterator:
+    """Expose an aligned protein FASTA file as a plain ProteinIterator."""
+    return ProteinIterator(_read_protein_fasta(str(fmt)))
diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py
@@ -12,7 +12,8 @@
 from . import (TSVTaxonomyDirectoryFormat, DNASequencesDirectoryFormat,
                PairedDNASequencesDirectoryFormat,
                AlignedDNASequencesDirectoryFormat,
-               DifferentialDirectoryFormat)
+               DifferentialDirectoryFormat, ProteinSequencesDirectoryFormat,
+               AlignedProteinSequencesDirectoryFormat)
 
 
 FeatureData = SemanticType('FeatureData', field_names='type')
@@ -30,9 +31,16 @@
 Differential = SemanticType('Differential',
                             variant_of=FeatureData.field['type'])
 
+# Semantic types for protein sequence data; both are variants of the
+# 'type' field of FeatureData, i.e. used as FeatureData[ProteinSequence]
+# and FeatureData[AlignedProteinSequence].
+ProteinSequence = SemanticType('ProteinSequence',
+                               variant_of=FeatureData.field['type'])
+
+AlignedProteinSequence = SemanticType('AlignedProteinSequence',
+                                      variant_of=FeatureData.field['type'])
+
 plugin.register_semantic_types(FeatureData, Taxonomy, Sequence,
                                PairedEndSequence, AlignedSequence,
-                               Differential)
+                               Differential, ProteinSequence,
+                               AlignedProteinSequence)
 
 
 plugin.register_semantic_type_to_format(
@@ -49,3 +57,9 @@
     artifact_format=AlignedDNASequencesDirectoryFormat)
 plugin.register_semantic_type_to_format(
     FeatureData[Differential], DifferentialDirectoryFormat)
+# Bind each protein semantic type to the directory format used for its
+# artifacts.
+plugin.register_semantic_type_to_format(
+    FeatureData[ProteinSequence],
+    artifact_format=ProteinSequencesDirectoryFormat)
+plugin.register_semantic_type_to_format(
+    FeatureData[AlignedProteinSequence],
+    artifact_format=AlignedProteinSequencesDirectoryFormat)
diff --git a/q2_types/feature_data/tests/data/aligned-protein-sequences.fasta b/q2_types/feature_data/tests/data/aligned-protein-sequences.fasta
@@ -0,0 +1,6 @@
+>sequence1
+------------------------VDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
+VASECEVKCMPTFQFFKKGQKVGEFSGAN
+>sequence2
+MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
+VASECEVKCMPTFQ-------VGEFSGAN
diff --git a/q2_types/feature_data/tests/data/protein-sequences-duplicate-ids.fasta b/q2_types/feature_data/tests/data/protein-sequences-duplicate-ids.fasta
@@ -0,0 +1,8 @@
+>sequence1
+MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
+AAAQIR
+>sequence2
+MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
+VASECEVKCMPTFQFFKKGQKVGEFSGAN
+>sequence1
+AFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQDQKVGEFSGA
diff --git a/q2_types/feature_data/tests/data/protein-sequences.fasta b/q2_types/feature_data/tests/data/protein-sequences.fasta
@@ -0,0 +1,6 @@
+>sequence1
+MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
+AAAQIR
+>sequence2
+MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
+VASECEVKCMPTFQFFKKGQKVGEFSGAN
diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py
@@ -16,7 +16,9 @@
     HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
     TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
     PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
-    AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat
+    AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat,
+    ProteinFASTAFormat, AlignedProteinFASTAFormat,
+    AlignedProteinSequencesDirectoryFormat, ProteinSequencesDirectoryFormat
 )
 from qiime2.plugin.testing import TestPluginBase
 from qiime2.plugin import ValidationError
@@ -342,5 +344,63 @@ def test_differential_format_bad_type(self):
             format.validate()
 
 
+class TestProteinFASTAFormats(TestPluginBase):
+    # Validation tests for the protein FASTA file formats and their
+    # single-file directory formats.
+    package = 'q2_types.feature_data.tests'
+
+    def test_protein_fasta_format_validate_positive(self):
+        # A well-formed protein FASTA file validates without raising.
+        filepath = self.get_data_path('protein-sequences.fasta')
+        format = ProteinFASTAFormat(filepath, mode='r')
+
+        format.validate()
+
+    def test_protein_fasta_format_invalid_characters(self):
+        # Reuses the existing 'not-dna-sequences.fasta' fixture and expects
+        # the "Invalid characters in sequence" ValueError.
+        filepath = self.get_data_path('not-dna-sequences.fasta')
+        format = ProteinFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValueError, "Invalid characters in sequence"):
+            format.validate()
+
+    def test_protein_fasta_format_empty_file(self):
+        # A file containing only a newline is expected to validate cleanly.
+        filepath = os.path.join(self.temp_dir.name, 'empty')
+        with open(filepath, 'w') as fh:
+            fh.write('\n')
+        format = ProteinFASTAFormat(filepath, mode='r')
+
+        format.validate()
+
+    def test_protein_sequences_directory_format(self):
+        # The directory format expects its file to be named
+        # 'protein-sequences.fasta', so copy the fixture under that name.
+        filepath = self.get_data_path('protein-sequences.fasta')
+        shutil.copy(filepath,
+                    os.path.join(
+                        self.temp_dir.name, 'protein-sequences.fasta'))
+        format = ProteinSequencesDirectoryFormat(self.temp_dir.name, mode='r')
+
+        format.validate()
+
+    def test_aligned_protein_fasta_format_validate_positive(self):
+        # An alignment whose rows share one length validates without raising.
+        filepath = self.get_data_path('aligned-protein-sequences.fasta')
+        format = AlignedProteinFASTAFormat(filepath, mode='r')
+
+        format.validate()
+
+    def test_aligned_protein_fasta_format_unaligned(self):
+        # The unaligned fixture holds sequences of length 70 and 93, so
+        # loading it as an alignment must fail with a length mismatch.
+        filepath = self.get_data_path('protein-sequences.fasta')
+        format = AlignedProteinFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValueError,
+                                    'length must match.* 93.* 70'):
+            format.validate()
+
+    def test_aligned_protein_sequences_directory_format(self):
+        # The directory format expects its file to be named
+        # 'aligned-protein-sequences.fasta'.
+        filepath = self.get_data_path('aligned-protein-sequences.fasta')
+        temp_dir = self.temp_dir.name
+        shutil.copy(filepath,
+                    os.path.join(temp_dir, 'aligned-protein-sequences.fasta'))
+        format = AlignedProteinSequencesDirectoryFormat(temp_dir, mode='r')
+
+        format.validate()
+
+
 if __name__ == '__main__':
     unittest.main()