diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py index 26c24c6b7..2a105a1f2 100644 --- a/qiita_db/metadata_template/base_metadata_template.py +++ b/qiita_db/metadata_template/base_metadata_template.py @@ -39,20 +39,24 @@ from future.utils import viewitems from os.path import join from functools import partial +from copy import deepcopy import pandas as pd +from skbio.util import find_duplicates from qiita_core.exceptions import IncompetentQiitaDeveloperError -from qiita_db.exceptions import (QiitaDBUnknownIDError, + +from qiita_db.exceptions import (QiitaDBUnknownIDError, QiitaDBColumnError, QiitaDBNotImplementedError, - QiitaDBColumnError) + QiitaDBDuplicateHeaderError) from qiita_db.base import QiitaObject from qiita_db.sql_connection import SQLConnectionHandler from qiita_db.util import (exists_table, get_table_cols, convert_to_id, get_mountpoint, insert_filepaths) from qiita_db.logger import LogEntry -from .util import as_python_types, get_datatypes +from .util import (as_python_types, get_datatypes, get_invalid_sample_names, + prefix_sample_names_with_id) class BaseSample(QiitaObject): @@ -566,9 +570,8 @@ def _check_special_columns(cls, md_template, obj): ---------- md_template : DataFrame The metadata template file contents indexed by sample ids - obj : Study or RawData - The obj to which the metadata template belongs to. 
@classmethod
def _clean_validate_template(cls, md_template, study_id, obj,
                             conn_handler=None):
    """Takes care of all validation and cleaning of metadata templates

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    study_id : int
        The study to which the metadata template belongs to.
    obj : object
        Any extra object needed by the template to perform any extra check
    conn_handler : SQLConnectionHandler, optional
        The connection handler used to retrieve the table columns from the
        database. If not provided, a new one is created.

    Returns
    -------
    md_template : DataFrame
        Cleaned copy of the input md_template

    Raises
    ------
    IncompetentQiitaDeveloperError
        If called directly on the base class (raised by `_check_subclass`)
    QiitaDBColumnError
        If the sample names in md_template contain invalid characters
    QiitaDBDuplicateHeaderError
        If md_template contains duplicate headers
    QiitaDBColumnError
        If md_template is missing a required column
    """
    cls._check_subclass()

    invalid_ids = get_invalid_sample_names(md_template.index)
    if invalid_ids:
        raise QiitaDBColumnError("The following sample names in the "
                                 "template contain invalid characters "
                                 "(only alphanumeric characters or periods"
                                 " are allowed): %s." %
                                 ", ".join(invalid_ids))

    # We are going to modify the md_template. We create a copy so
    # we don't modify the user one
    md_template = deepcopy(md_template)

    # Prefix the sample names with the study_id
    prefix_sample_names_with_id(md_template, study_id)

    # In the database, all the column headers are lowercase
    md_template.columns = [c.lower() for c in md_template.columns]

    # Check that we don't have duplicate columns. This must run after the
    # lowercasing above, so headers differing only in case are caught.
    if len(set(md_template.columns)) != len(md_template.columns):
        raise QiitaDBDuplicateHeaderError(
            find_duplicates(md_template.columns))

    # We need to check for some special columns, that are not present on
    # the database, but depending on the data type are required.
    missing = cls._check_special_columns(md_template, obj)

    conn_handler = conn_handler or SQLConnectionHandler()

    # Get the required columns from the DB
    db_cols = get_table_cols(cls._table, conn_handler)

    # Remove the sample_id and study_id columns
    db_cols.remove('sample_id')
    db_cols.remove(cls._id_column)

    # Retrieve the headers of the metadata template
    headers = list(md_template.keys())

    # Check that md_template has the required columns
    remaining = set(db_cols).difference(headers)
    missing = missing.union(remaining)
    missing = missing.difference(cls.translate_cols_dict)
    if missing:
        # `missing` is a set; sort it so the error message is deterministic
        raise QiitaDBColumnError("Missing columns: %s"
                                 % ', '.join(sorted(missing)))
    return md_template
import IncompetentQiitaDeveloperError from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError, - QiitaDBDuplicateHeaderError, QiitaDBError, - QiitaDBExecutionError) + QiitaDBError, QiitaDBExecutionError) from qiita_db.sql_connection import SQLConnectionHandler from qiita_db.ontology import Ontology -from qiita_db.util import (get_table_cols, get_emp_status, convert_to_id, +from qiita_db.util import (get_emp_status, convert_to_id, convert_from_id, get_mountpoint, infer_status) from .base_metadata_template import BaseSample, MetadataTemplate -from .util import (get_invalid_sample_names, prefix_sample_names_with_id, - load_template_to_dataframe) +from .util import load_template_to_dataframe from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT, REQUIRED_TARGET_GENE_COLS) @@ -109,29 +104,6 @@ def create(cls, md_template, raw_data, study, data_type, if investigation_type is not None: cls.validate_investigation_type(investigation_type) - invalid_ids = get_invalid_sample_names(md_template.index) - if invalid_ids: - raise QiitaDBColumnError("The following sample names in the prep" - " template contain invalid characters " - "(only alphanumeric characters or periods" - " are allowed): %s." % - ", ".join(invalid_ids)) - - # We are going to modify the md_template. 
We create a copy so - # we don't modify the user one - md_template = deepcopy(md_template) - - # Prefix the sample names with the study_id - prefix_sample_names_with_id(md_template, study.id) - - # In the database, all the column headers are lowercase - md_template.columns = [c.lower() for c in md_template.columns] - - # Check that we don't have duplicate columns - if len(set(md_template.columns)) != len(md_template.columns): - raise QiitaDBDuplicateHeaderError( - find_duplicates(md_template.columns)) - # Get a connection handler conn_handler = SQLConnectionHandler() queue_name = "CREATE_PREP_TEMPLATE_%d" % raw_data.id @@ -146,27 +118,8 @@ def create(cls, md_template, raw_data, study, data_type, data_type_id = convert_to_id(data_type, "data_type", conn_handler) data_type_str = data_type - # We need to check for some special columns, that are not present on - # the database, but depending on the data type are required. - missing = cls._check_special_columns(md_template, data_type_str) - - # Get the required columns from the DB - db_cols = get_table_cols(cls._table, conn_handler) - - # Remove the sample_id and study_id columns - db_cols.remove('sample_id') - db_cols.remove(cls._id_column) - - # Retrieve the headers of the metadata template - headers = list(md_template.keys()) - - # Check that md_template has the required columns - remaining = set(db_cols).difference(headers) - missing = missing.union(remaining) - missing = missing.difference(cls.translate_cols_dict) - if missing: - raise QiitaDBColumnError("Missing columns: %s" - % ', '.join(missing)) + md_template = cls._clean_validate_template(md_template, study.id, + data_type_str, conn_handler) # Insert the metadata template # We need the prep_id for multiple calls below, which currently is not diff --git a/qiita_db/metadata_template/sample_template.py b/qiita_db/metadata_template/sample_template.py index 4a5574a96..4257a40bc 100644 --- a/qiita_db/metadata_template/sample_template.py +++ 
b/qiita_db/metadata_template/sample_template.py @@ -8,18 +8,15 @@ from __future__ import division from future.builtins import zip -from copy import deepcopy from os.path import join from time import strftime from os.path import basename import pandas as pd import warnings -from skbio.util import find_duplicates from qiita_core.exceptions import IncompetentQiitaDeveloperError -from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBColumnError, - QiitaDBDuplicateHeaderError, QiitaDBError, +from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError, QiitaDBWarning) from qiita_db.sql_connection import SQLConnectionHandler from qiita_db.util import (get_table_cols, get_required_sample_info_status, @@ -27,8 +24,7 @@ from qiita_db.study import Study from qiita_db.data import RawData from .base_metadata_template import BaseSample, MetadataTemplate -from .util import (get_invalid_sample_names, prefix_sample_names_with_id, - as_python_types, get_datatypes) +from .util import as_python_types, get_datatypes from .prep_template import PrepTemplate @@ -114,70 +110,6 @@ def _check_template_special_columns(cls, md_template, study_id): """ return set() - @classmethod - def _clean_validate_template(cls, md_template, study_id, - conn_handler=None): - """Takes care of all validation and cleaning of sample templates - - Parameters - ---------- - md_template : DataFrame - The metadata template file contents indexed by sample ids - study_id : int - The study to which the sample template belongs to. - - Returns - ------- - md_template : DataFrame - Cleaned copy of the input md_template - """ - invalid_ids = get_invalid_sample_names(md_template.index) - if invalid_ids: - raise QiitaDBColumnError("The following sample names in the sample" - " template contain invalid characters " - "(only alphanumeric characters or periods" - " are allowed): %s." % - ", ".join(invalid_ids)) - # We are going to modify the md_template. 
We create a copy so - # we don't modify the user one - md_template = deepcopy(md_template) - - # Prefix the sample names with the study_id - prefix_sample_names_with_id(md_template, study_id) - - # In the database, all the column headers are lowercase - md_template.columns = [c.lower() for c in md_template.columns] - - # Check that we don't have duplicate columns - if len(set(md_template.columns)) != len(md_template.columns): - raise QiitaDBDuplicateHeaderError( - find_duplicates(md_template.columns)) - - # We need to check for some special columns, that are not present on - # the database, but depending on the data type are required. - missing = cls._check_special_columns(md_template, study_id) - - conn_handler = conn_handler if conn_handler else SQLConnectionHandler() - - # Get the required columns from the DB - db_cols = get_table_cols(cls._table, conn_handler) - - # Remove the sample_id and study_id columns - db_cols.remove('sample_id') - db_cols.remove(cls._id_column) - - # Retrieve the headers of the metadata template - headers = list(md_template.keys()) - - # Check that md_template has the required columns - remaining = set(db_cols).difference(headers) - missing = missing.union(remaining) - missing = missing.difference(cls.translate_cols_dict) - if missing: - raise QiitaDBColumnError("Missing columns: %s" - % ', '.join(missing)) - return md_template - @classmethod def create(cls, md_template, study): r"""Creates the sample template in the database @@ -201,7 +133,7 @@ def create(cls, md_template, study): # Clean and validate the metadata template given md_template = cls._clean_validate_template(md_template, study.id, - conn_handler) + study.id, conn_handler) cls._add_common_creation_steps_to_queue(md_template, study.id, conn_handler, queue_name) diff --git a/qiita_db/metadata_template/test/test_base_metadata_template.py b/qiita_db/metadata_template/test/test_base_metadata_template.py index fcbc16a86..4d7cfa184 100644 --- 
def test_clean_validate_template(self):
    """_clean_validate_template raises an error from base class"""
    self.assertRaises(IncompetentQiitaDeveloperError,
                      MetadataTemplate._clean_validate_template,
                      None, 1, None, None)


def test_clean_validate_template_error_bad_chars(self):
    """Sample names with invalid characters raise QiitaDBColumnError"""
    self.metadata.index = ['o()xxxx[{::::::::>', 'sample.1', 'sample.3']
    handler = SQLConnectionHandler()
    self.assertRaises(QiitaDBColumnError,
                      PrepTemplate._clean_validate_template,
                      self.metadata, 2, "16S", handler)


def test_clean_validate_template_error_duplicate_cols(self):
    """Duplicated column headers raise QiitaDBDuplicateHeaderError"""
    handler = SQLConnectionHandler()
    blank_column = pd.Series(['', '', ''], index=self.metadata.index)
    self.metadata['STR_COLUMN'] = blank_column
    self.assertRaises(QiitaDBDuplicateHeaderError,
                      PrepTemplate._clean_validate_template,
                      self.metadata, 2, "16S", handler)


def test_clean_validate_template_error_missing(self):
    """A template missing a required column raises QiitaDBColumnError"""
    handler = SQLConnectionHandler()
    # Single-sample template used to trigger the missing-column error
    sample_row = {
        'center_name': 'ANL',
        'center_project_name': 'Test Project',
        'ebi_submission_accession': None,
        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
        'barcodesequence': 'GTCCGCAAGTTA',
        'run_prefix': "s_G1_L001_sequences",
        'platform': 'ILLUMINA',
        'library_construction_protocol': 'AAAA',
        'experiment_design_description': 'BBBB',
    }
    metadata = pd.DataFrame.from_dict({'SKB8.640193': sample_row},
                                      orient='index')
    self.assertRaises(QiitaDBColumnError,
                      PrepTemplate._clean_validate_template,
                      metadata, 2, "16S", handler)
def test_clean_validate_template(self):
    """The cleaned frame for a valid prep template matches the expected one
    """
    handler = SQLConnectionHandler()
    obs = PrepTemplate._clean_validate_template(self.metadata, 2, "16S",
                                                handler)

    # Values that are identical for every expected sample
    shared = {
        'center_name': 'ANL',
        'center_project_name': 'Test Project',
        'ebi_submission_accession': None,
        'emp_status_id': 1,
        'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
        'platform': 'ILLUMINA',
        'library_construction_protocol': 'AAAA',
        'experiment_design_description': 'BBBB',
    }
    # (sample id, str_column, barcodesequence, run_prefix)
    per_sample = [
        ('2.SKB8.640193', 'Value for sample 1', 'GTCCGCAAGTTA',
         "s_G1_L001_sequences"),
        ('2.SKD8.640184', 'Value for sample 2', 'CGTAGAGCTCTC',
         "s_G1_L001_sequences"),
        ('2.SKB7.640196', 'Value for sample 3', 'CCTCTGAGAGCT',
         "s_G1_L002_sequences"),
    ]
    expected_rows = {}
    for sample_id, str_value, barcode, run_prefix in per_sample:
        row = dict(shared)
        row['str_column'] = str_value
        row['barcodesequence'] = barcode
        row['run_prefix'] = run_prefix
        expected_rows[sample_id] = row
    exp = pd.DataFrame.from_dict(expected_rows, orient='index')

    # Sort rows and columns so the comparison ignores ordering
    for frame in (obs, exp):
        frame.sort_index(axis=0, inplace=True)
        frame.sort_index(axis=1, inplace=True)
    assert_frame_equal(obs, exp)
def test_clean_validate_template_error_bad_chars(self):
    """Sample names with invalid characters raise QiitaDBColumnError"""
    self.metadata.index = ['o()xxxx[{::::::::>', 'sample.1', 'sample.3']
    handler = SQLConnectionHandler()
    self.assertRaises(QiitaDBColumnError,
                      SampleTemplate._clean_validate_template,
                      self.metadata, 2, 2, handler)


def test_clean_validate_template_error_duplicate_cols(self):
    """Duplicated column headers raise QiitaDBDuplicateHeaderError"""
    handler = SQLConnectionHandler()
    blank_column = pd.Series(['', '', ''], index=self.metadata.index)
    self.metadata['STR_COLUMN'] = blank_column
    self.assertRaises(QiitaDBDuplicateHeaderError,
                      SampleTemplate._clean_validate_template,
                      self.metadata, 2, 2, handler)


# NOTE(review): "valdate" in the method name is a typo for "validate"; the
# name is kept as-is so the test id does not change.
def test_clean_valdate_template_error_missing(self):
    """A template missing a required column raises QiitaDBColumnError"""
    handler = SQLConnectionHandler()
    # Single-sample template used to trigger the missing-column error
    sample_row = {
        'physical_location': 'location1',
        'has_physical_specimen': True,
        'has_extracted_data': True,
        'sample_type': 'type1',
        'required_sample_info_status': 'received',
        'host_subject_id': 'NotIdentified',
        'Description': 'Test Sample 1',
        'latitude': 42.42,
        'longitude': 41.41,
    }
    metadata = pd.DataFrame.from_dict({'Sample1': sample_row},
                                      orient='index')
    self.assertRaises(QiitaDBColumnError,
                      SampleTemplate._clean_validate_template,
                      metadata, 2, 2, handler)
# NOTE(review): "valdate" in the method name is a typo for "validate"; the
# name is kept as-is so the test id does not change.
def test_clean_valdate_template(self):
    """The cleaned frame for a valid sample template matches the expected one
    """
    handler = SQLConnectionHandler()
    obs = SampleTemplate._clean_validate_template(self.metadata, 2, 2,
                                                  handler)

    # Values that are identical for every expected sample
    shared = {
        'physical_location': 'location1',
        'has_physical_specimen': True,
        'has_extracted_data': True,
        'sample_type': 'type1',
        'required_sample_info_status_id': 1,
        'collection_timestamp': datetime(2014, 5, 29, 12, 24, 51),
        'host_subject_id': 'NotIdentified',
    }
    # (sample id, description, str_column, int_column, latitude, longitude)
    per_sample = [
        ('2.Sample1', 'Test Sample 1', 'Value for sample 1', 1,
         42.42, 41.41),
        ('2.Sample2', 'Test Sample 2', 'Value for sample 2', 2,
         4.2, 1.1),
        ('2.Sample3', 'Test Sample 3', 'Value for sample 3', 3,
         4.8, 4.41),
    ]
    expected_rows = {}
    for sample_id, desc, str_value, int_value, lat, lon in per_sample:
        row = dict(shared)
        row['description'] = desc
        row['str_column'] = str_value
        row['int_column'] = int_value
        row['latitude'] = lat
        row['longitude'] = lon
        expected_rows[sample_id] = row
    exp = pd.DataFrame.from_dict(expected_rows, orient='index')

    # Sort rows and columns so the comparison ignores ordering
    for frame in (obs, exp):
        frame.sort_index(axis=0, inplace=True)
        frame.sort_index(axis=1, inplace=True)
    assert_frame_equal(obs, exp)