Skip to content

Commit

Permalink
Merge pull request #2075 from antgonza/fix-timestamp
Browse files Browse the repository at this point in the history
Fix timestamp / -> -
  • Loading branch information
wasade committed Feb 15, 2017
2 parents 4bf4808 + f4835d5 commit aa68a21
Show file tree
Hide file tree
Showing 19 changed files with 460 additions and 284 deletions.
20 changes: 12 additions & 8 deletions qiita_db/handlers/tests/test_prep_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,18 @@ def test_get(self):
path_builder = partial(join, db_test_template_dir)

obs = loads(obs.body)
exp = {'data_type': '18S',
'artifact': 1,
'investigation_type': 'Metagenomics',
'study': 1,
'status': 'private',
'qiime-map': path_builder('1_prep_1_qiime_19700101-000000.txt'),
'prep-file': path_builder('1_prep_1_19700101-000000.txt')}
self.assertEqual(obs, exp)

# we have to check key by key because, since patch 51, the test info
# files are updated
self.assertEqual(obs['data_type'], '18S')
self.assertEqual(obs['artifact'], 1)
self.assertEqual(obs['investigation_type'], 'Metagenomics')
self.assertEqual(obs['study'], 1)
self.assertEqual(obs['status'], 'private')
self.assertTrue(obs['qiime-map'].startswith(
path_builder('1_prep_1_qiime_')))
self.assertTrue(obs['prep-file'].startswith(
path_builder('1_prep_1_')))


class PrepTemplateDataHandlerTests(OauthTestingBase):
Expand Down
13 changes: 7 additions & 6 deletions qiita_db/metadata_template/base_metadata_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -1430,7 +1430,11 @@ def validate(self, restriction_dict):
else:
valid_null = qdb.metadata_template.constants.EBI_NULL_VALUES
for column, datatype in viewitems(restriction.columns):
for sample, val in viewitems(self.get_category(column)):
# sorting by key (sample id) so we always check in the
# same order, helpful for testing
cats_by_column = self.get_category(column)
for sample in sorted(cats_by_column):
val = cats_by_column[sample]
# ignore if valid null value
if val in valid_null:
continue
Expand All @@ -1439,11 +1443,8 @@ def validate(self, restriction_dict):
val = str(val)
formats = [
# 4 digits year
'%m/%d/%Y %H:%M:%S', '%m/%d/%Y %H:%M',
'%m/%d/%Y %H', '%m/%d/%Y', '%m/%Y', '%Y',
# 2 digits year
'%m/%d/%y %H:%M:%S', '%m/%d/%y %H:%M',
'%m/%d/%y %H', '%m/%d/%y', '%m/%y', '%y'
'%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',
'%Y-%m-%d %H', '%Y-%m-%d', '%Y-%m', '%Y'
]
date = None
for fmt in formats:
Expand Down
2 changes: 2 additions & 0 deletions qiita_db/metadata_template/test/test_prep_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,6 +917,8 @@ def test_create_qiime_mapping_file(self):
exp = pd.read_csv(
exp_fp, sep='\t', infer_datetime_format=False,
parse_dates=False, index_col=False, comment='\t')
obs = obs.reindex_axis(sorted(obs.columns), axis=1)
exp = exp.reindex_axis(sorted(exp.columns), axis=1)

assert_frame_equal(obs, exp)

Expand Down
180 changes: 102 additions & 78 deletions qiita_db/metadata_template/test/test_sample_template.py

Large diffs are not rendered by default.

134 changes: 67 additions & 67 deletions qiita_db/metadata_template/test/test_util.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions qiita_db/support_files/patches/51.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Feb 9, 2017
-- changing format of stored timestamps
-- This SQL file only runs a no-op statement; the actual work is done by the
-- Python patch with the same number (python_patches/51.py)

SELECT 1;
107 changes: 107 additions & 0 deletions qiita_db/support_files/patches/python_patches/51.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from future.utils import viewitems
from datetime import datetime

from qiita_db.metadata_template.constants import (
SAMPLE_TEMPLATE_COLUMNS, PREP_TEMPLATE_COLUMNS,
PREP_TEMPLATE_COLUMNS_TARGET_GENE)
from qiita_db.metadata_template.prep_template import PrepTemplate
from qiita_db.metadata_template.sample_template import SampleTemplate
from qiita_db.sql_connection import TRN


# Names of the columns, in each kind of info file, whose declared type is
# datetime: these are the only columns whose values have to be rewritten.
cols_sample = [col
               for key, vals in viewitems(SAMPLE_TEMPLATE_COLUMNS)
               for col, dt in viewitems(vals.columns) if dt == datetime]
# Both prep restriction sets are walked in a single comprehension. Chaining
# ``.extend()`` onto a list literal evaluates to None (extend works in place),
# which would leave cols_prep empty and silently skip every prep template.
# The datetime filter is applied to the target gene restrictions too, so that
# non-timestamp columns are never handed to transform_date.
cols_prep = [col
             for restrictions in (PREP_TEMPLATE_COLUMNS,
                                  PREP_TEMPLATE_COLUMNS_TARGET_GENE)
             for key, vals in viewitems(restrictions)
             for col, dt in viewitems(vals.columns) if dt == datetime]


def transform_date(value):
    """Rewrite a month-first timestamp into the year-first format.

    The old layouts are month-first (``%m/%d/%Y %H:%M:%S`` down to a bare
    year), written with either ``/`` or ``-`` as the date separator and with
    either a 4 or a 2 digit year. The matching new layout keeps the same
    precision but is year-first and dash separated (``%Y-%m-%d %H:%M:%S``
    down to ``%Y``).

    Parameters
    ----------
    value : str
        The stored value to convert.

    Returns
    -------
    str
        The value in the new layout, or ``value`` itself, untouched, when it
        does not match any of the old layouts (free text, values already in
        the new layout, or non-string cells such as None/NaN).
    """
    # for the way the patches are applied we need to have this import and
    # the format table within this function
    from datetime import datetime

    # (old layout, new layout); {0} is the date separator and {1} the year
    # directive. A tuple keeps the order in which layouts are tried fixed.
    layouts = (
        ('%m{0}%d{0}{1} %H:%M:%S', '%Y-%m-%d %H:%M:%S'),
        ('%m{0}%d{0}{1} %H:%M', '%Y-%m-%d %H:%M'),
        ('%m{0}%d{0}{1} %H', '%Y-%m-%d %H'),
        ('%m{0}%d{0}{1}', '%Y-%m-%d'),
        ('%m{0}{1}', '%Y-%m'),
        ('{1}', '%Y'),
    )

    # 4 digits year first, then 2 digits year
    for year in ('%Y', '%y'):
        for sep in ('/', '-'):
            for old_fmt, new_fmt in layouts:
                try:
                    parsed = datetime.strptime(
                        value, old_fmt.format(sep, year))
                except (ValueError, TypeError):
                    # ValueError: the text does not follow this layout;
                    # TypeError: the cell is not a string at all
                    continue
                return parsed.strftime(new_fmt)

    # nothing matched: leave the stored value as it is
    return value

if cols_sample:
    with TRN:
        # For every table whose name matches 'sample_%' and that holds at
        # least one of the datetime columns, retrieve the table name and the
        # list of those column names found in it.
        sample_datetime_cols = tuple(set(cols_sample))
        TRN.add(
            """SELECT table_name, array_agg(column_name::text)
                    FROM information_schema.columns
                    WHERE column_name IN %s
                        AND table_name LIKE 'sample_%%'
                        AND table_name NOT IN (
                            'prep_template', 'prep_template_sample')
                    GROUP BY table_name""",
            [sample_datetime_cols])
        columns_by_table = dict(TRN.execute_fetchindex())
        for table_name, date_columns in viewitems(columns_by_table):
            # the table name has the form table_#, the # being the id
            template_id = int(table_name.split('_')[1])
            template = SampleTemplate(template_id)
            # restrict the dataframe to the columns that need converting
            frame = template.to_dataframe()[date_columns]
            # rewrite each value of those columns in the new format
            for column_name in date_columns:
                frame[column_name] = frame[column_name].apply(transform_date)
            template.update(frame)

if cols_prep:
    with TRN:
        # For every table whose name matches 'prep_%' (leaving out
        # 'prep_template' and 'prep_template_sample') and that holds at least
        # one of the datetime columns, retrieve the table name and the list
        # of those column names found in it.
        prep_datetime_cols = tuple(set(cols_prep))
        TRN.add(
            """SELECT table_name, array_agg(column_name::text)
                    FROM information_schema.columns
                    WHERE column_name IN %s
                        AND table_name LIKE 'prep_%%'
                        AND table_name NOT IN (
                            'prep_template', 'prep_template_sample')
                    GROUP BY table_name""",
            [prep_datetime_cols])
        columns_by_table = dict(TRN.execute_fetchindex())
        for table_name, date_columns in viewitems(columns_by_table):
            # the table name has the form table_#, the # being the id
            template_id = int(table_name.split('_')[1])
            template = PrepTemplate(template_id)
            # restrict the dataframe to the columns that need converting
            frame = template.to_dataframe()[date_columns]
            # rewrite each value of those columns in the new format
            for column_name in date_columns:
                frame[column_name] = frame[column_name].apply(transform_date)
            template.update(frame)
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#SampleID BarcodeSequence LinkerPrimerSequence center_name center_project_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate elevation env_biome env_feature has_extracted_data has_physical_specimen host_subject_id host_taxid latitude longitude ph physical_location samp_salinity sample_type season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil qiita_study_title qiita_study_alias qiita_owner qiita_principal_investigator Description
1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome
1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome
1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome
#SampleID BarcodeSequence LinkerPrimerSequence center_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate dna_extracted elevation env_biome env_feature host_subject_id host_taxid latitude longitude ph physical_specimen_location physical_specimen_remaining qiita_study_id samp_salinity sample_type scientific_name season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil qiita_study_title qiita_study_alias qiita_owner qiita_principal_investigator Description
1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome
1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL true 1 7.1 ENVO:soil 1118232 winter 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome
1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome
Loading

0 comments on commit aa68a21

Please sign in to comment.