Skip to content

Commit

Permalink
Merge pull request #194 from squirrelo/add-data-type-to-preproc-proc-…
Browse files Browse the repository at this point in the history
…data

Add data_type_id column to preprocessed and processed data tables
  • Loading branch information
ElDeveloper committed Jul 16, 2014
2 parents 44169c3 + f8147d8 commit 8aea0d2
Show file tree
Hide file tree
Showing 7 changed files with 471 additions and 219 deletions.
138 changes: 115 additions & 23 deletions qiita_db/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
from .base import QiitaObject
from .sql_connection import SQLConnectionHandler
from .util import (exists_dynamic_table, get_db_files_base_dir,
insert_filepaths)
insert_filepaths, convert_to_id)


class BaseData(QiitaObject):
Expand Down Expand Up @@ -190,6 +190,7 @@ class RawData(BaseData):
Methods
-------
create
data_type
See Also
--------
Expand Down Expand Up @@ -253,6 +254,27 @@ def studies(self):
[self._id])
return [id[0] for id in ids]

def data_type(self, ret_id=False):
    """Return the data type (or its id) associated with this raw data.

    The value is looked up through qiita.common_prep_info, which links a
    raw_data_id to a data_type_id.

    Parameters
    ----------
    ret_id : bool, optional
        When True, return the integer data_type_id instead of the
        data_type string. Default False.

    Returns
    -------
    str or int
        The data_type string, or the data_type_id when `ret_id` is True.
    """
    # Selecting "data_type" vs "data_type_id" is done by suffixing the
    # column name in the query.
    suffix = "_id" if ret_id else ""
    sql = ("SELECT d.data_type{0} FROM qiita.data_type d JOIN "
           "qiita.common_prep_info c ON c.data_type_id = d.data_type_id "
           "WHERE c.raw_data_id = %s").format(suffix)
    conn_handler = SQLConnectionHandler()
    row = conn_handler.execute_fetchone(sql, (self._id, ))
    return row[0]


class PreprocessedData(BaseData):
r"""Object for dealing with preprocessed data
Expand All @@ -266,6 +288,7 @@ class PreprocessedData(BaseData):
-------
create
is_submitted_to_insdc
data_type
See Also
--------
Expand All @@ -280,7 +303,8 @@ class PreprocessedData(BaseData):

@classmethod
def create(cls, study, preprocessed_params_table, preprocessed_params_id,
filepaths, raw_data=None, submitted_to_insdc=False):
filepaths, raw_data=None, data_type=None,
submitted_to_insdc=False):
r"""Creates a new object with a new id on the storage system
Parameters
Expand All @@ -299,14 +323,31 @@ def create(cls, study, preprocessed_params_table, preprocessed_params_id,
If true, the raw data files have been submitted to insdc
raw_data : RawData, optional
The RawData object used as base to this preprocessed data
data_type : str, optional
The data_type of the preprocessed_data
Raises
------
IncompetentQiitaDeveloperError
If the table `preprocessed_params_table` does not exists
IncompetentQiitaDeveloperError
If data_type does not match that of raw_data passed
"""
conn_handler = SQLConnectionHandler()
# We first check that the preprocessed_params_table exists
if (data_type and raw_data) and data_type != raw_data.data_type:
raise IncompetentQiitaDeveloperError(
"data_type passed does not match raw_data data_type!")
elif data_type is None and raw_data is None:
raise IncompetentQiitaDeveloperError("Neither data_type nor "
"raw_data passed!")
elif raw_data:
# raw_data passed but no data_type, so set to raw data data_type
data_type = raw_data.data_type(ret_id=True)
else:
# only data_type, so need id from the text
data_type = convert_to_id(data_type, "data_type", conn_handler)
# Check that the preprocessed_params_table exists
if not exists_dynamic_table(preprocessed_params_table, "preprocessed_",
"_params", conn_handler):
raise IncompetentQiitaDeveloperError(
Expand All @@ -316,12 +357,13 @@ def create(cls, study, preprocessed_params_table, preprocessed_params_id,
# and get the preprocessed data id back
ppd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (preprocessed_params_table, "
"preprocessed_params_id, submitted_to_insdc) VALUES "
"(%(param_table)s, %(param_id)s, %(insdc)s) "
"preprocessed_params_id, submitted_to_insdc, data_type_id) VALUES "
"(%(param_table)s, %(param_id)s, %(insdc)s, %(data_type)s) "
"RETURNING preprocessed_data_id".format(cls._table),
{'param_table': preprocessed_params_table,
'param_id': preprocessed_params_id,
'insdc': submitted_to_insdc})[0]
'insdc': submitted_to_insdc,
'data_type': data_type})[0]
ppd = cls(ppd_id)

# Connect the preprocessed data with its study
Expand Down Expand Up @@ -363,6 +405,28 @@ def study(self):
"preprocessed_data_id=%s".format(self._study_preprocessed_table),
[self._id])[0]

def data_type(self, ret_id=False):
    """Return the data type (or its id) of this preprocessed data.

    Reads the data_type_id column of this object's table (self._table)
    and joins it against qiita.data_type.

    Parameters
    ----------
    ret_id : bool, optional
        When True, return the integer data_type_id instead of the
        data_type string. Default False.

    Returns
    -------
    str or int
        The data_type string, or the data_type_id when `ret_id` is True.
    """
    # Column suffix chooses between the string and the id.
    suffix = "_id" if ret_id else ""
    sql = ("SELECT d.data_type{0} FROM qiita.data_type d JOIN "
           "qiita.{1} p ON p.data_type_id = d.data_type_id "
           "WHERE p.preprocessed_data_id = %s").format(suffix, self._table)
    conn_handler = SQLConnectionHandler()
    row = conn_handler.execute_fetchone(sql, (self._id, ))
    return row[0]

def is_submitted_to_insdc(self):
r"""Tells if the raw data has been submitted to insdc
Expand All @@ -387,6 +451,7 @@ class ProcessedData(BaseData):
Methods
-------
create
data_type
See Also
--------
Expand All @@ -401,7 +466,8 @@ class ProcessedData(BaseData):

@classmethod
def create(cls, processed_params_table, processed_params_id, filepaths,
preprocessed_data=None, study=None, processed_date=None):
preprocessed_data=None, study=None, processed_date=None,
data_type=None):
r"""
Parameters
----------
Expand All @@ -420,6 +486,9 @@ def create(cls, processed_params_table, processed_params_id, filepaths,
belongs to
processed_date : datetime, optional
Date in which the data have been processed. Default: now
data_type : str, optional
data_type of the processed_data. Otherwise taken from passed
preprocessed_data.
Raises
------
Expand All @@ -428,17 +497,30 @@ def create(cls, processed_params_table, processed_params_id, filepaths,
If `preprocessed_data` and `study` are provided at the same time
If `preprocessed_data` and `study` are not provided
"""
conn_handler = SQLConnectionHandler()
if preprocessed_data is not None:
if study is not None:
raise IncompetentQiitaDeveloperError(
"You should provide either preprocessed_data or study, "
"but not both")
elif data_type is not None and \
data_type != preprocessed_data.data_type():
raise IncompetentQiitaDeveloperError(
"data_type passed does not match preprocessed_data "
"data_type!")
else:
data_type = preprocessed_data.data_type(ret_id=True)
else:
if study is None:
raise IncompetentQiitaDeveloperError(
"You should provide either a preprocessed_data or a study")
if data_type is None:
raise IncompetentQiitaDeveloperError(
"You must provide either a preprocessed_data, a "
"data_type, or both")
else:
data_type = convert_to_id(data_type, "data_type", conn_handler)

conn_handler = SQLConnectionHandler()
# We first check that the processed_params_table exists
if not exists_dynamic_table(processed_params_table,
"processed_params_", "", conn_handler):
Expand All @@ -454,12 +536,13 @@ def create(cls, processed_params_table, processed_params_id, filepaths,
# and get the processed data id back
pd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (processed_params_table, "
"processed_params_id, processed_date) VALUES (%(param_table)s, "
"%(param_id)s, %(date)s) RETURNING "
"processed_data_id".format(cls._table),
"processed_params_id, processed_date, data_type_id) VALUES ("
"%(param_table)s, %(param_id)s, %(date)s, %(data_type)s) RETURNING"
" processed_data_id".format(cls._table),
{'param_table': processed_params_table,
'param_id': processed_params_id,
'date': processed_date})[0]
'date': processed_date,
'data_type': data_type})[0]

pd = cls(pd_id)

Expand Down Expand Up @@ -491,15 +574,24 @@ def preprocessed_data(self):
"processed_data_id=%s".format(self._preprocessed_processed_table),
[self._id])[0]

def data_type(self, ret_id=False):
    """Return the data type (or its id) of this processed data.

    Reads the data_type_id column of this object's table (self._table)
    and joins it against qiita.data_type. This replaces the previous
    read-only ``data_type`` property, which walked back through
    preprocessed/raw data tables; the id is now stored directly on the
    processed data row.

    Parameters
    ----------
    ret_id : bool, optional
        When True, return the integer data_type_id instead of the
        data_type string. Default False.

    Returns
    -------
    str or int
        The data_type string, or the data_type_id when `ret_id` is True.
    """
    conn_handler = SQLConnectionHandler()
    # Column suffix chooses between the string and the id.
    ret = "_id" if ret_id else ""
    data_type = conn_handler.execute_fetchone(
        "SELECT d.data_type{0} FROM qiita.data_type d JOIN "
        "qiita.{1} p ON p.data_type_id = d.data_type_id WHERE"
        " p.processed_data_id = %s".format(ret, self._table),
        (self._id, ))
    return data_type[0]
6 changes: 3 additions & 3 deletions qiita_db/sql_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,10 @@ def _sql_executor(self, sql, sql_args=None, many=False):
self._connection.commit()
except PostgresError as e:
self._connection.rollback()
if sql_args and isinstance(sql_args[0], Iterable):
err_sql = cur.mogrify(sql, sql_args[0])
else:
try:
err_sql = cur.mogrify(sql, sql_args)
except ValueError:
err_sql = cur.mogrify(sql, sql_args[0])
raise QiitaDBExecutionError(("\nError running SQL query: %s"
"\nError: %s" % (err_sql, e)))

Expand Down
4 changes: 2 additions & 2 deletions qiita_db/support_files/populate_test_db.sql
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ INSERT INTO qiita.prep_1 (sample_id, BarcodeSequence, LIBRARY_CONSTRUCTION_PROTO
('SKM9.640192', 'AGCAGGCACGAA', 'This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.', 'GTGCCAGCMGCCGCGGTAA', 'V4', '16S rRNA', 'ANL', 's_G1_L001_sequences', '8/1/12', 'ANL', 'micro biome of soil and rhizosphere of cannabis plants from CA', 'Cannabis Soil Microbiome', 'Illumina', '.25,g', 'Sequencing by synthesis', 'MiSeq', 'ANL', 'FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT', 'CCME');

-- Insert preprocessed information for raw data 1
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE), ('preprocessed_sequence_illumina_params', 2, FALSE);
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc, data_type_id) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE, 2), ('preprocessed_sequence_illumina_params', 2, FALSE, 2);

-- Link the new preprocessed data with the raw data
INSERT INTO qiita.raw_preprocessed_data (raw_data_id, preprocessed_data_id) VALUES (1, 1), (1, 2);
Expand All @@ -306,7 +306,7 @@ INSERT INTO qiita.preprocessed_filepath (preprocessed_data_id, filepath_id) VALU
INSERT INTO qiita.preprocessed_sequence_illumina_params (trim_length) VALUES (151), (100);

-- Insert processed information for study 0 and processed data 1
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date, data_type_id) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012', 2);

-- Link the processed data with the preprocessed data
INSERT INTO qiita.preprocessed_processed_data (preprocessed_data_id, processed_data_id) VALUES (1, 1);
Expand Down
20 changes: 17 additions & 3 deletions qiita_db/support_files/qiita-db.dbs
Original file line number Diff line number Diff line change
Expand Up @@ -574,9 +574,16 @@ Linked by y being raw_data_id from raw data table.</comment>
</column>
<column name="preprocessed_params_id" type="bigint" jt="-5" mandatory="y" />
<column name="submitted_to_insdc" type="bool" jt="-7" mandatory="y" />
<column name="data_type_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_preprocessed_data" unique="PRIMARY_KEY" >
<column name="preprocessed_data_id" />
</index>
<index name="idx_preprocessed_data" unique="NORMAL" >
<column name="data_type_id" />
</index>
<fk name="fk_preprocessed_data" to_schema="qiita" to_table="data_type" >
<fk_column name="data_type_id" pk="data_type_id" />
</fk>
</table>
<table name="preprocessed_filepath" >
<column name="preprocessed_data_id" type="bigint" jt="-5" mandatory="y" />
Expand Down Expand Up @@ -660,9 +667,16 @@ Linked by y being raw_data_id from raw data table.</comment>
<comment><![CDATA[Link to a table with the parameters used to generate processed data]]></comment>
</column>
<column name="processed_date" type="timestamp" jt="93" mandatory="y" />
<column name="data_type_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_processed_data" unique="PRIMARY_KEY" >
<column name="processed_data_id" />
</index>
<index name="idx_processed_data" unique="NORMAL" >
<column name="data_type_id" />
</index>
<fk name="fk_processed_data" to_schema="qiita" to_table="data_type" >
<fk_column name="data_type_id" pk="data_type_id" />
</fk>
</table>
<table name="processed_filepath" >
<column name="processed_data_id" type="bigint" jt="-5" mandatory="y" />
Expand Down Expand Up @@ -1303,8 +1317,6 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="portal_type" color="c0d4f3" x="1845" y="720" />
<entity schema="qiita" name="raw_data" color="d0def5" x="1230" y="480" />
<entity schema="qiita" name="raw_preprocessed_data" color="b2cdf7" x="1230" y="585" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="705" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<entity schema="qiita" name="command" color="d0def5" x="210" y="1095" />
<entity schema="qiita" name="logging" color="c0d4f3" x="1365" y="1200" />
Expand All @@ -1313,9 +1325,11 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="preprocessed_spectra_params" color="d0def5" x="1830" y="825" />
<entity schema="qiita" name="preprocessed_sequence_454_params" color="c0d4f3" x="1815" y="915" />
<entity schema="qiita" name="preprocessed_sequence_illumina_params" color="d0def5" x="1800" y="1005" />
<entity schema="qiita" name="processed_data" color="d0def5" x="1215" y="930" />
<entity schema="qiita" name="study_processed_data" color="b2cdf7" x="1455" y="930" />
<entity schema="qiita" name="command_data_type" color="c0d4f3" x="390" y="1155" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_data" color="d0def5" x="1215" y="930" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="690" />
<group name="Group_analyses" color="c4e0f9" >
<comment>analysis tables</comment>
<entity schema="qiita" name="analysis" />
Expand Down
Loading

0 comments on commit 8aea0d2

Please sign in to comment.