Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data_type_id column to preprocessed and processed data tables #194

Merged
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 115 additions & 23 deletions qiita_db/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
from .base import QiitaObject
from .sql_connection import SQLConnectionHandler
from .util import (exists_dynamic_table, get_db_files_base_dir,
insert_filepaths)
insert_filepaths, convert_to_id)


class BaseData(QiitaObject):
Expand Down Expand Up @@ -190,6 +190,7 @@ class RawData(BaseData):
Methods
-------
create
data_type

See Also
--------
Expand Down Expand Up @@ -253,6 +254,27 @@ def studies(self):
[self._id])
return [id[0] for id in ids]

def data_type(self, ret_id=False):
    """Returns the data_type or data_type_id of this raw data

    The value is looked up through qiita.common_prep_info, which links
    this raw data to its data_type row.

    Parameters
    ----------
    ret_id : bool, optional
        If True, return the integer data_type_id instead of the
        data_type string. Default False.

    Returns
    -------
    str or int
        The data_type as a string when `ret_id` is False (the default);
        the data_type_id as an integer when `ret_id` is True.
    """
    # Selecting "data_type" vs "data_type_id" is done by suffixing the
    # column name; the row id itself is passed as a bound SQL parameter.
    ret = "_id" if ret_id else ""
    conn_handler = SQLConnectionHandler()
    data_type = conn_handler.execute_fetchone(
        "SELECT d.data_type{0} FROM qiita.data_type d JOIN "
        "qiita.common_prep_info c ON c.data_type_id = d.data_type_id WHERE"
        " c.raw_data_id = %s".format(ret), (self._id, ))
    return data_type[0]


class PreprocessedData(BaseData):
r"""Object for dealing with preprocessed data
Expand All @@ -266,6 +288,7 @@ class PreprocessedData(BaseData):
-------
create
is_submitted_to_insdc
data_type

See Also
--------
Expand All @@ -280,7 +303,8 @@ class PreprocessedData(BaseData):

@classmethod
def create(cls, study, preprocessed_params_table, preprocessed_params_id,
filepaths, raw_data=None, submitted_to_insdc=False):
filepaths, raw_data=None, data_type=None,
submitted_to_insdc=False):
r"""Creates a new object with a new id on the storage system

Parameters
Expand All @@ -299,14 +323,31 @@ def create(cls, study, preprocessed_params_table, preprocessed_params_id,
If true, the raw data files have been submitted to insdc
raw_data : RawData, optional
The RawData object used as base to this preprocessed data
data_type : str, optional
The data_type of the preprocessed_data


Raises
------
IncompetentQiitaDeveloperError
If the table `preprocessed_params_table` does not exists
IncompetentQiitaDeveloperError
If data_type does not match that of raw_data passed
"""
conn_handler = SQLConnectionHandler()
# We first check that the preprocessed_params_table exists
if (data_type and raw_data) and data_type != raw_data.data_type:
raise IncompetentQiitaDeveloperError(
"data_type passed does not match raw_data data_type!")
elif data_type is None and raw_data is None:
raise IncompetentQiitaDeveloperError("Neither data_type nor "
"raw_data passed!")
elif raw_data:
# raw_data passed but no data_type, so set to raw data data_type
data_type = raw_data.data_type(ret_id=True)
else:
# only data_type, so need id from the text
data_type = convert_to_id(data_type, "data_type", conn_handler)
# Check that the preprocessed_params_table exists
if not exists_dynamic_table(preprocessed_params_table, "preprocessed_",
"_params", conn_handler):
raise IncompetentQiitaDeveloperError(
Expand All @@ -316,12 +357,13 @@ def create(cls, study, preprocessed_params_table, preprocessed_params_id,
# and get the preprocessed data id back
ppd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (preprocessed_params_table, "
"preprocessed_params_id, submitted_to_insdc) VALUES "
"(%(param_table)s, %(param_id)s, %(insdc)s) "
"preprocessed_params_id, submitted_to_insdc, data_type_id) VALUES "
"(%(param_table)s, %(param_id)s, %(insdc)s, %(data_type)s) "
"RETURNING preprocessed_data_id".format(cls._table),
{'param_table': preprocessed_params_table,
'param_id': preprocessed_params_id,
'insdc': submitted_to_insdc})[0]
'insdc': submitted_to_insdc,
'data_type': data_type})[0]
ppd = cls(ppd_id)

# Connect the preprocessed data with its study
Expand Down Expand Up @@ -363,6 +405,28 @@ def study(self):
"preprocessed_data_id=%s".format(self._study_preprocessed_table),
[self._id])[0]

def data_type(self, ret_id=False):
    """Returns the data_type or data_type_id of this preprocessed data

    The value comes from the data_type_id column stored directly on the
    preprocessed data table (self._table).

    Parameters
    ----------
    ret_id : bool, optional
        If True, return the integer data_type_id instead of the
        data_type string. Default False.

    Returns
    -------
    str or int
        The data_type as a string when `ret_id` is False (the default);
        the data_type_id as an integer when `ret_id` is True.
    """
    conn_handler = SQLConnectionHandler()
    # Column suffix chooses "data_type" vs "data_type_id"; the row id is
    # passed as a bound SQL parameter, never interpolated.
    ret = "_id" if ret_id else ""
    data_type = conn_handler.execute_fetchone(
        "SELECT d.data_type{0} FROM qiita.data_type d JOIN "
        "qiita.{1} p ON p.data_type_id = d.data_type_id WHERE"
        " p.preprocessed_data_id = %s".format(ret, self._table),
        (self._id, ))
    return data_type[0]

def is_submitted_to_insdc(self):
r"""Tells if the raw data has been submitted to insdc

Expand All @@ -387,6 +451,7 @@ class ProcessedData(BaseData):
Methods
-------
create
data_type

See Also
--------
Expand All @@ -401,7 +466,8 @@ class ProcessedData(BaseData):

@classmethod
def create(cls, processed_params_table, processed_params_id, filepaths,
preprocessed_data=None, study=None, processed_date=None):
preprocessed_data=None, study=None, processed_date=None,
data_type=None):
r"""
Parameters
----------
Expand All @@ -420,6 +486,9 @@ def create(cls, processed_params_table, processed_params_id, filepaths,
belongs to
processed_date : datetime, optional
Date in which the data have been processed. Default: now
data_type : str, optional
data_type of the processed_data. Otherwise taken from passed
preprocessed_data.

Raises
------
Expand All @@ -428,17 +497,30 @@ def create(cls, processed_params_table, processed_params_id, filepaths,
If `preprocessed_data` and `study` are provided at the same time
If `preprocessed_data` and `study` are not provided
"""
conn_handler = SQLConnectionHandler()
if preprocessed_data is not None:
if study is not None:
raise IncompetentQiitaDeveloperError(
"You should provide either preprocessed_data or study, "
"but not both")
elif data_type is not None and \
data_type != preprocessed_data.data_type():
raise IncompetentQiitaDeveloperError(
"data_type passed does not match preprocessed_data "
"data_type!")
else:
data_type = preprocessed_data.data_type(ret_id=True)
else:
if study is None:
raise IncompetentQiitaDeveloperError(
"You should provide either a preprocessed_data or a study")
if data_type is None:
raise IncompetentQiitaDeveloperError(
"You must provide either a preprocessed_data, a "
"data_type, or both")
else:
data_type = convert_to_id(data_type, "data_type", conn_handler)

conn_handler = SQLConnectionHandler()
# We first check that the processed_params_table exists
if not exists_dynamic_table(processed_params_table,
"processed_params_", "", conn_handler):
Expand All @@ -454,12 +536,13 @@ def create(cls, processed_params_table, processed_params_id, filepaths,
# and get the processed data id back
pd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (processed_params_table, "
"processed_params_id, processed_date) VALUES (%(param_table)s, "
"%(param_id)s, %(date)s) RETURNING "
"processed_data_id".format(cls._table),
"processed_params_id, processed_date, data_type_id) VALUES ("
"%(param_table)s, %(param_id)s, %(date)s, %(data_type)s) RETURNING"
" processed_data_id".format(cls._table),
{'param_table': processed_params_table,
'param_id': processed_params_id,
'date': processed_date})[0]
'date': processed_date,
'data_type': data_type})[0]

pd = cls(pd_id)

Expand Down Expand Up @@ -491,15 +574,24 @@ def preprocessed_data(self):
"processed_data_id=%s".format(self._preprocessed_processed_table),
[self._id])[0]

def data_type(self, ret_id=False):
    """Returns the data_type or data_type_id of this processed data

    The value comes from the data_type_id column stored directly on the
    processed data table (self._table), replacing the previous multi-join
    lookup through the preprocessed/raw data tables.

    Parameters
    ----------
    ret_id : bool, optional
        If True, return the integer data_type_id instead of the
        data_type string. Default False.

    Returns
    -------
    str or int
        The data_type as a string when `ret_id` is False (the default);
        the data_type_id as an integer when `ret_id` is True.
    """
    conn_handler = SQLConnectionHandler()
    # Column suffix chooses "data_type" vs "data_type_id"; the row id is
    # passed as a bound SQL parameter, never interpolated.
    ret = "_id" if ret_id else ""
    data_type = conn_handler.execute_fetchone(
        "SELECT d.data_type{0} FROM qiita.data_type d JOIN "
        "qiita.{1} p ON p.data_type_id = d.data_type_id WHERE"
        " p.processed_data_id = %s".format(ret, self._table),
        (self._id, ))
    return data_type[0]
6 changes: 3 additions & 3 deletions qiita_db/sql_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,10 @@ def _sql_executor(self, sql, sql_args=None, many=False):
self._connection.commit()
except PostgresError as e:
self._connection.rollback()
if sql_args and isinstance(sql_args[0], Iterable):
err_sql = cur.mogrify(sql, sql_args[0])
else:
try:
err_sql = cur.mogrify(sql, sql_args)
except ValueError:
err_sql = cur.mogrify(sql, sql_args[0])
raise QiitaDBExecutionError(("\nError running SQL query: %s"
"\nError: %s" % (err_sql, e)))

Expand Down
4 changes: 2 additions & 2 deletions qiita_db/support_files/populate_test_db.sql
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ INSERT INTO qiita.prep_1 (sample_id, BarcodeSequence, LIBRARY_CONSTRUCTION_PROTO
('SKM9.640192', 'AGCAGGCACGAA', 'This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.', 'GTGCCAGCMGCCGCGGTAA', 'V4', '16S rRNA', 'ANL', 's_G1_L001_sequences', '8/1/12', 'ANL', 'micro biome of soil and rhizosphere of cannabis plants from CA', 'Cannabis Soil Microbiome', 'Illumina', '.25,g', 'Sequencing by synthesis', 'MiSeq', 'ANL', 'FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT', 'CCME');

-- Insert preprocessed information for raw data 1
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE), ('preprocessed_sequence_illumina_params', 2, FALSE);
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc, data_type_id) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE, 2), ('preprocessed_sequence_illumina_params', 2, FALSE, 2);

-- Link the new preprocessed data with the raw data
INSERT INTO qiita.raw_preprocessed_data (raw_data_id, preprocessed_data_id) VALUES (1, 1), (1, 2);
Expand All @@ -306,7 +306,7 @@ INSERT INTO qiita.preprocessed_filepath (preprocessed_data_id, filepath_id) VALU
INSERT INTO qiita.preprocessed_sequence_illumina_params (trim_length) VALUES (151), (100);

-- Insert processed information for study 0 and processed data 1
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date, data_type_id) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012', 2);

-- Link the processed data with the preprocessed data
INSERT INTO qiita.preprocessed_processed_data (preprocessed_data_id, processed_data_id) VALUES (1, 1);
Expand Down
20 changes: 17 additions & 3 deletions qiita_db/support_files/qiita-db.dbs
Original file line number Diff line number Diff line change
Expand Up @@ -574,9 +574,16 @@ Linked by y being raw_data_id from raw data table.</comment>
</column>
<column name="preprocessed_params_id" type="bigint" jt="-5" mandatory="y" />
<column name="submitted_to_insdc" type="bool" jt="-7" mandatory="y" />
<column name="data_type_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_preprocessed_data" unique="PRIMARY_KEY" >
<column name="preprocessed_data_id" />
</index>
<index name="idx_preprocessed_data" unique="NORMAL" >
<column name="data_type_id" />
</index>
<fk name="fk_preprocessed_data" to_schema="qiita" to_table="data_type" >
<fk_column name="data_type_id" pk="data_type_id" />
</fk>
</table>
<table name="preprocessed_filepath" >
<column name="preprocessed_data_id" type="bigint" jt="-5" mandatory="y" />
Expand Down Expand Up @@ -660,9 +667,16 @@ Linked by y being raw_data_id from raw data table.</comment>
<comment><![CDATA[Link to a table with the parameters used to generate processed data]]></comment>
</column>
<column name="processed_date" type="timestamp" jt="93" mandatory="y" />
<column name="data_type_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_processed_data" unique="PRIMARY_KEY" >
<column name="processed_data_id" />
</index>
<index name="idx_processed_data" unique="NORMAL" >
<column name="data_type_id" />
</index>
<fk name="fk_processed_data" to_schema="qiita" to_table="data_type" >
<fk_column name="data_type_id" pk="data_type_id" />
</fk>
</table>
<table name="processed_filepath" >
<column name="processed_data_id" type="bigint" jt="-5" mandatory="y" />
Expand Down Expand Up @@ -1303,8 +1317,6 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="portal_type" color="c0d4f3" x="1845" y="720" />
<entity schema="qiita" name="raw_data" color="d0def5" x="1230" y="480" />
<entity schema="qiita" name="raw_preprocessed_data" color="b2cdf7" x="1230" y="585" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="705" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<entity schema="qiita" name="command" color="d0def5" x="210" y="1095" />
<entity schema="qiita" name="logging" color="c0d4f3" x="1365" y="1200" />
Expand All @@ -1313,9 +1325,11 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="preprocessed_spectra_params" color="d0def5" x="1830" y="825" />
<entity schema="qiita" name="preprocessed_sequence_454_params" color="c0d4f3" x="1815" y="915" />
<entity schema="qiita" name="preprocessed_sequence_illumina_params" color="d0def5" x="1800" y="1005" />
<entity schema="qiita" name="processed_data" color="d0def5" x="1215" y="930" />
<entity schema="qiita" name="study_processed_data" color="b2cdf7" x="1455" y="930" />
<entity schema="qiita" name="command_data_type" color="c0d4f3" x="390" y="1155" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_data" color="d0def5" x="1215" y="930" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="690" />
<group name="Group_analyses" color="c4e0f9" >
<comment>analysis tables</comment>
<entity schema="qiita" name="analysis" />
Expand Down
Loading