Merge pull request #2075 from antgonza/fix-timestamp

Fix timestamp / -> -
qiita-spots · Feb 15, 2017 · aa68a21 · aa68a21
2 parents 4bf4808 + f4835d5
commit aa68a21
Show file tree

Hide file tree

Showing 19 changed files with 460 additions and 284 deletions.
diff --git a/qiita_db/handlers/tests/test_prep_template.py b/qiita_db/handlers/tests/test_prep_template.py
@@ -46,14 +46,18 @@ def test_get(self):
         path_builder = partial(join, db_test_template_dir)
 
         obs = loads(obs.body)
-        exp = {'data_type': '18S',
-               'artifact': 1,
-               'investigation_type': 'Metagenomics',
-               'study': 1,
-               'status': 'private',
-               'qiime-map': path_builder('1_prep_1_qiime_19700101-000000.txt'),
-               'prep-file': path_builder('1_prep_1_19700101-000000.txt')}
-        self.assertEqual(obs, exp)
+
+        # have to check per key because since patch 51 we are updating the
+        # test info files
+        self.assertEqual(obs['data_type'], '18S')
+        self.assertEqual(obs['artifact'], 1)
+        self.assertEqual(obs['investigation_type'], 'Metagenomics')
+        self.assertEqual(obs['study'], 1)
+        self.assertEqual(obs['status'], 'private')
+        self.assertTrue(obs['qiime-map'].startswith(
+            path_builder('1_prep_1_qiime_')))
+        self.assertTrue(obs['prep-file'].startswith(
+            path_builder('1_prep_1_')))
 
 
 class PrepTemplateDataHandlerTests(OauthTestingBase):

diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py
@@ -1430,7 +1430,11 @@ def validate(self, restriction_dict):
             else:
                 valid_null = qdb.metadata_template.constants.EBI_NULL_VALUES
                 for column, datatype in viewitems(restriction.columns):
-                    for sample, val in viewitems(self.get_category(column)):
+                    # sorting by key (sample id) so we always check in the
+                    # same order, helpful for testing
+                    cats_by_column = self.get_category(column)
+                    for sample in sorted(cats_by_column):
+                        val = cats_by_column[sample]
                         # ignore if valid null value
                         if val in valid_null:
                             continue
@@ -1439,11 +1443,8 @@ def validate(self, restriction_dict):
                             val = str(val)
                             formats = [
                                 # 4 digits year
-                                '%m/%d/%Y %H:%M:%S', '%m/%d/%Y %H:%M',
-                                '%m/%d/%Y %H', '%m/%d/%Y', '%m/%Y', '%Y',
-                                # 2 digits year
-                                '%m/%d/%y %H:%M:%S', '%m/%d/%y %H:%M',
-                                '%m/%d/%y %H', '%m/%d/%y', '%m/%y', '%y'
+                                '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',
+                                '%Y-%m-%d %H', '%Y-%m-%d', '%Y-%m', '%Y'
                                 ]
                             date = None
                             for fmt in formats:

diff --git a/qiita_db/metadata_template/test/test_prep_template.py b/qiita_db/metadata_template/test/test_prep_template.py
@@ -917,6 +917,8 @@ def test_create_qiime_mapping_file(self):
         exp = pd.read_csv(
             exp_fp, sep='\t', infer_datetime_format=False,
             parse_dates=False, index_col=False, comment='\t')
+        obs = obs.reindex_axis(sorted(obs.columns), axis=1)
+        exp = exp.reindex_axis(sorted(exp.columns), axis=1)
 
         assert_frame_equal(obs, exp)
 

diff --git a/qiita_db/metadata_template/test/test_sample_template.py b/qiita_db/metadata_template/test/test_sample_template.py
diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
diff --git a/qiita_db/support_files/patches/51.sql b/qiita_db/support_files/patches/51.sql
@@ -0,0 +1,5 @@
+-- Feb 9, 2017
+-- changing format of stored timestamps
+-- see python patch
+
+SELECT 1;
diff --git a/qiita_db/support_files/patches/python_patches/51.py b/qiita_db/support_files/patches/python_patches/51.py
@@ -0,0 +1,107 @@
+from future.utils import viewitems
+from datetime import datetime
+
+from qiita_db.metadata_template.constants import (
+    SAMPLE_TEMPLATE_COLUMNS, PREP_TEMPLATE_COLUMNS,
+    PREP_TEMPLATE_COLUMNS_TARGET_GENE)
+from qiita_db.metadata_template.prep_template import PrepTemplate
+from qiita_db.metadata_template.sample_template import SampleTemplate
+from qiita_db.sql_connection import TRN
+
+
+# Collect, for each kind of info file, the names of the columns whose
+# declared type is datetime; only those columns have to be migrated to the
+# new timestamp format
+cols_sample = [col
+               for key, vals in viewitems(SAMPLE_TEMPLATE_COLUMNS)
+               for col, dt in viewitems(vals.columns) if dt == datetime]
+# The two lists are joined with `+`: `list.extend` returns None, which would
+# leave cols_prep as None and silently skip the prep migration below. The
+# target gene restrictions get the same datetime filter so that columns of
+# any other type are never handed to transform_date
+cols_prep = [col
+             for key, vals in viewitems(PREP_TEMPLATE_COLUMNS)
+             for col, dt in viewitems(vals.columns) if dt == datetime] + [
+                 col
+                 for key, vals in viewitems(PREP_TEMPLATE_COLUMNS_TARGET_GENE)
+                 for col, dt in viewitems(vals.columns) if dt == datetime]
+
+
+def transform_date(value):
+    """Rewrite a timestamp string from the old formats to the new ones
+
+    Parameters
+    ----------
+    value : str
+        The stored value, expected to follow one of the old month-first
+        formats (with a 4 or a 2 digits year)
+
+    Returns
+    -------
+    str
+        The timestamp in the matching year-first, dash-separated format, or
+        `value` unchanged if it does not match any of the old formats
+
+    Notes
+    -----
+    2 digits years are expanded to 4 digits by `datetime.strptime`
+    """
+    # for the way the patches are applied we need to have this import and
+    # the formats variable within this function
+    from datetime import datetime
+
+    # (old format, new format) pairs; a tuple is used so the formats are
+    # always tried in the same order. The previous validation only accepted
+    # slash-separated dates, so every precision is listed with '/'; the
+    # dash-separated month-first variants that this patch already handled
+    # are kept so those values are still converted
+    formats = (
+        # 4 digits year
+        ('%m/%d/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S'),
+        ('%m/%d/%Y %H:%M', '%Y-%m-%d %H:%M'),
+        ('%m-%d-%Y %H:%M', '%Y-%m-%d %H:%M'),
+        ('%m/%d/%Y %H', '%Y-%m-%d %H'),
+        ('%m/%d/%Y', '%Y-%m-%d'),
+        ('%m-%d-%Y', '%Y-%m-%d'),
+        ('%m/%Y', '%Y-%m'),
+        ('%m-%Y', '%Y-%m'),
+        ('%Y', '%Y'),
+        # 2 digits year
+        ('%m/%d/%y %H:%M:%S', '%Y-%m-%d %H:%M:%S'),
+        ('%m/%d/%y %H:%M', '%Y-%m-%d %H:%M'),
+        ('%m-%d-%y %H:%M', '%Y-%m-%d %H:%M'),
+        ('%m/%d/%y %H', '%Y-%m-%d %H'),
+        ('%m/%d/%y', '%Y-%m-%d'),
+        ('%m-%d-%y', '%Y-%m-%d'),
+        ('%m/%y', '%Y-%m'),
+        ('%m-%y', '%Y-%m'),
+        ('%y', '%Y'),
+    )
+
+    # try the old formats one by one; the first one that parses wins
+    for old_fmt, new_fmt in formats:
+        try:
+            date = datetime.strptime(value, old_fmt)
+        except ValueError:
+            # not this format, try the next one
+            continue
+        return date.strftime(new_fmt)
+
+    # not in any of the old formats (e.g. already migrated or a null
+    # value), so it is left untouched
+    return value
+
+# Migrate the datetime columns of the sample info tables; nothing is done
+# when no datetime column names were collected in cols_sample
+if cols_sample:
+    with TRN:
+        # for every table whose name matches 'sample_%', get the list of its
+        # columns that are named like one of the datetime columns. The
+        # NOT IN list cannot exclude anything here, as neither of those
+        # names matches 'sample_%'. The doubled '%%' is presumably the
+        # escape for a literal '%' in a parameterized query -- confirm
+        sql = """SELECT table_name, array_agg(column_name::text)
+                    FROM information_schema.columns
+                    WHERE column_name IN %s
+                        AND table_name LIKE 'sample_%%'
+                        AND table_name NOT IN (
+                            'prep_template', 'prep_template_sample')
+                    GROUP BY table_name"""
+        # the column names are de-duplicated with set() and passed as the
+        # single parameter of the IN clause
+        TRN.add(sql, [tuple(set(cols_sample))])
+        for table, columns in viewitems(dict(TRN.execute_fetchindex())):
+            # [1] the format is table_# so taking the #
+            st = SampleTemplate(int(table.split('_')[1]))
+            # getting just the columns of interest
+            st_df = st.to_dataframe()[columns]
+            # rewriting every value of those columns with transform_date,
+            # i.e. the values stay strings but in the new timestamp format
+            for col in columns:
+                st_df[col] = st_df[col].apply(transform_date)
+            # presumably stores the rewritten values back -- confirm against
+            # SampleTemplate.update
+            st.update(st_df)
+
+# Migrate the datetime columns of the prep info tables; nothing is done
+# when cols_prep is empty
+if cols_prep:
+    with TRN:
+        # for every table whose name matches 'prep_%', get the list of its
+        # columns that are named like one of the datetime columns. The
+        # tables 'prep_template' and 'prep_template_sample' also match the
+        # pattern but have no numeric id after the first '_', so they are
+        # excluded. The doubled '%%' is presumably the escape for a literal
+        # '%' in a parameterized query -- confirm
+        sql = """SELECT table_name, array_agg(column_name::text)
+                    FROM information_schema.columns
+                    WHERE column_name IN %s
+                        AND table_name LIKE 'prep_%%'
+                        AND table_name NOT IN (
+                            'prep_template', 'prep_template_sample')
+                    GROUP BY table_name"""
+        # the column names are de-duplicated with set() and passed as the
+        # single parameter of the IN clause
+        TRN.add(sql, [tuple(set(cols_prep))])
+        for table, columns in viewitems(dict(TRN.execute_fetchindex())):
+            # [1] the format is table_# so taking the #
+            pt = PrepTemplate(int(table.split('_')[1]))
+            # getting just the columns of interest
+            pt_df = pt.to_dataframe()[columns]
+            # rewriting every value of those columns with transform_date,
+            # i.e. the values stay strings but in the new timestamp format
+            for col in columns:
+                pt_df[col] = pt_df[col].apply(transform_date)
+            # presumably stores the rewritten values back -- confirm against
+            # PrepTemplate.update
+            pt.update(pt_df)
diff --git a/qiita_db/support_files/test_data/analysis/1_analysis_mapping_exp.txt b/qiita_db/support_files/test_data/analysis/1_analysis_mapping_exp.txt
@@ -1,4 +1,4 @@
-#SampleID	BarcodeSequence	LinkerPrimerSequence	center_name	center_project_name	emp_status	experiment_center	experiment_design_description	experiment_title	illumina_technology	instrument_model	library_construction_protocol	pcr_primers	platform	run_center	run_date	run_prefix	samp_size	sample_center	sequencing_meth	study_center	target_gene	target_subfragment	qiita_prep_id	altitude	anonymized_name	assigned_from_geo	collection_timestamp	common_name	country	depth	description_duplicate	elevation	env_biome	env_feature	has_extracted_data	has_physical_specimen	host_subject_id	host_taxid	latitude	longitude	ph	physical_location	samp_salinity	sample_type	season_environment	taxon_id	temp	texture	tot_nitro	tot_org_carb	water_content_soil	qiita_study_title	qiita_study_alias	qiita_owner	qiita_principal_investigator	Description
-1.SKB8.640193	AGCGCTCACATC	GTGCCAGCMGCCGCGGTAA	ANL		EMP	ANL	micro biome of soil and rhizosphere of cannabis plants from CA	Cannabis Soil Microbiome	MiSeq	Illumina MiSeq	This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.	FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT	Illumina	ANL	8/1/12	s_G1_L001_sequences	.25,g	ANL	Sequencing by synthesis	CCME	16S rRNA	V4	1	0.0	SKB8	n	2011-11-11 13:00:00	root metagenome	GAZ:United States of America	0.15	Burmese root	114.0	ENVO:Temperate grasslands, savannas, and shrubland biome	ENVO:plant-associated habitat	True	True	1001:M7	3483	74.0894932572	65.3283470202	6.94	ANL	7.15	ENVO:soil	winter	1118232	15.0	64.6 sand, 17.6 silt, 17.8 clay	1.41	5.0	0.164	Identification of the Microbiomes for Cannabis Soils	Cannabis Soils	Dude	PIDude	Cannabis Soil Microbiome
-1.SKD8.640184	TGAGTGGTCTGT	GTGCCAGCMGCCGCGGTAA	ANL		EMP	ANL	micro biome of soil and rhizosphere of cannabis plants from CA	Cannabis Soil Microbiome	MiSeq	Illumina MiSeq	This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.	FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT	Illumina	ANL	8/1/12	s_G1_L001_sequences	.25,g	ANL	Sequencing by synthesis	CCME	16S rRNA	V4	1	0.0	SKD8	n	2011-11-11 13:00:00	root metagenome	GAZ:United States of America	0.15	Diesel Root	114.0	ENVO:Temperate grasslands, savannas, and shrubland biome	ENVO:plant-associated habitat	True	True	1001:D9	3483	57.571893782	32.5563076447	6.8	ANL	7.1	ENVO:soil	winter	1118232	15.0	66 sand, 16.3 silt, 17.7 clay	1.51	4.32	0.178	Identification of the Microbiomes for Cannabis Soils	Cannabis Soils	Dude	PIDude	Cannabis Soil Microbiome
-1.SKB7.640196	CGGCCTAAGTTC	GTGCCAGCMGCCGCGGTAA	ANL		EMP	ANL	micro biome of soil and rhizosphere of cannabis plants from CA	Cannabis Soil Microbiome	MiSeq	Illumina MiSeq	This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.	FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT	Illumina	ANL	8/1/12	s_G1_L001_sequences	.25,g	ANL	Sequencing by synthesis	CCME	16S rRNA	V4	1	0.0	SKB7	n	2011-11-11 13:00:00	root metagenome	GAZ:United States of America	0.15	Burmese root	114.0	ENVO:Temperate grasslands, savannas, and shrubland biome	ENVO:plant-associated habitat	True	True	1001:M8	3483	13.089194595	92.5274472082	6.94	ANL	7.15	ENVO:soil	winter	1118232	15.0	64.6 sand, 17.6 silt, 17.8 clay	1.41	5.0	0.164	Identification of the Microbiomes for Cannabis Soils	Cannabis Soils	Dude	PIDude	Cannabis Soil Microbiome
+#SampleID	BarcodeSequence	LinkerPrimerSequence	center_name	emp_status	experiment_center	experiment_design_description	experiment_title	illumina_technology	instrument_model	library_construction_protocol	pcr_primers	platform	run_center	run_date	run_prefix	samp_size	sample_center	sequencing_meth	study_center	target_gene	target_subfragment	qiita_prep_id	altitude	anonymized_name	assigned_from_geo	collection_timestamp	common_name	country	depth	description_duplicate	dna_extracted	elevation	env_biome	env_feature	host_subject_id	host_taxid	latitude	longitude	ph	physical_specimen_location	physical_specimen_remaining	qiita_study_id	samp_salinity	sample_type	scientific_name	season_environment	taxon_id	temp	texture	tot_nitro	tot_org_carb	water_content_soil	qiita_study_title	qiita_study_alias	qiita_owner	qiita_principal_investigator	Description
+1.SKB8.640193	AGCGCTCACATC	GTGCCAGCMGCCGCGGTAA	ANL	EMP	ANL	micro biome of soil and rhizosphere of cannabis plants from CA	Cannabis Soil Microbiome	MiSeq	Illumina MiSeq	This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.	FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT	Illumina	ANL	8/1/12	s_G1_L001_sequences	.25,g	ANL	Sequencing by synthesis	CCME	16S rRNA	V4	1	0	SKB8	n	2011-11-11 13:00:00	root metagenome	GAZ:United States of America	0.15	Burmese root	true	114	ENVO:Temperate grasslands, savannas, and shrubland biome	ENVO:plant-associated habitat	1001:M7	3483	74.0894932572	65.3283470202	6.94	ANL	true	1	7.15	ENVO:soil	1118232	winter	1118232	15	64.6 sand, 17.6 silt, 17.8 clay	1.41	5	0.164	Identification of the Microbiomes for Cannabis Soils	Cannabis Soils	Dude	PIDude	Cannabis Soil Microbiome
+1.SKD8.640184	TGAGTGGTCTGT	GTGCCAGCMGCCGCGGTAA	ANL	EMP	ANL	micro biome of soil and rhizosphere of cannabis plants from CA	Cannabis Soil Microbiome	MiSeq	Illumina MiSeq	This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.	FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT	Illumina	ANL	8/1/12	s_G1_L001_sequences	.25,g	ANL	Sequencing by synthesis	CCME	16S rRNA	V4	1	0	SKD8	n	2011-11-11 13:00:00	root metagenome	GAZ:United States of America	0.15	Diesel Root	true	114	ENVO:Temperate grasslands, savannas, and shrubland biome	ENVO:plant-associated habitat	1001:D9	3483	57.571893782	32.5563076447	6.8	ANL	true	1	7.1	ENVO:soil	1118232	winter	1118232	15	66 sand, 16.3 silt, 17.7 clay	1.51	4.32	0.178	Identification of the Microbiomes for Cannabis Soils	Cannabis Soils	Dude	PIDude	Cannabis Soil Microbiome
+1.SKB7.640196	CGGCCTAAGTTC	GTGCCAGCMGCCGCGGTAA	ANL	EMP	ANL	micro biome of soil and rhizosphere of cannabis plants from CA	Cannabis Soil Microbiome	MiSeq	Illumina MiSeq	This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.	FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT	Illumina	ANL	8/1/12	s_G1_L001_sequences	.25,g	ANL	Sequencing by synthesis	CCME	16S rRNA	V4	1	0	SKB7	n	2011-11-11 13:00:00	root metagenome	GAZ:United States of America	0.15	Burmese root	true	114	ENVO:Temperate grasslands, savannas, and shrubland biome	ENVO:plant-associated habitat	1001:M8	3483	13.089194595	92.5274472082	6.94	ANL	true	1	7.15	ENVO:soil	1118232	winter	1118232	15	64.6 sand, 17.6 silt, 17.8 clay	1.41	5	0.164	Identification of the Microbiomes for Cannabis Soils	Cannabis Soils	Dude	PIDude	Cannabis Soil Microbiome