-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor purge function #837
Changes from 6 commits
e16168e
b10d40e
26bb72d
f557296
1095cc9
4ca81fd
3a58a90
66bc2e4
6fd04c5
9b230a2
155a9d3
002c3a9
373f7ff
dba6d64
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
from qiita_db.exceptions import QiitaDBColumnError, QiitaDBError | ||
from qiita_db.data import RawData | ||
from qiita_db.study import Study | ||
from qiita_db.reference import Reference | ||
from qiita_db.util import (exists_table, exists_dynamic_table, scrub_data, | ||
compute_checksum, check_table_cols, | ||
check_required_columns, convert_to_id, | ||
|
@@ -421,6 +422,68 @@ def test_purge_filepaths(self): | |
self.assertTrue(exists(fp1)) | ||
self.assertFalse(exists(fp_exp_id)) | ||
|
||
def test_purge_filepaths_null_cols(self): | ||
"""Tests the nulls vs not in issue in purge_filepaths""" | ||
# For more details about the issue: | ||
# http://www.depesz.com/2008/08/13/nulls-vs-not-in/ | ||
# In the current set up, the only place where we can actually have a | ||
# null value in a filepath id is in the reference table. Add a new | ||
# reference without tree and taxonomy: | ||
fd, seqs_fp = mkstemp(suffix="_seqs.fna") | ||
close(fd) | ||
ref = Reference.create("null_db", "13_2", seqs_fp) | ||
self.files_to_remove.append(ref.sequence_fp) | ||
# Add a new filepath to the database | ||
fd, fp = mkstemp() | ||
close(fd) | ||
exp_id = 1 + self.conn_handler.execute_fetchone( | ||
"SELECT count(1) FROM qiita.filepath")[0] | ||
fp_id = self.conn_handler.execute_fetchone( | ||
"INSERT INTO qiita.filepath " | ||
"(filepath, filepath_type_id, checksum, checksum_algorithm_id) " | ||
"VALUES (%s, %s, %s, %s) RETURNING filepath_id", (fp, 1, "", 1))[0] | ||
self.assertEqual(fp_id, exp_id) | ||
|
||
# Connect the just added filepath to a raw data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. connect -> link There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
self.conn_handler.execute( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we do these with actual API calls or is this really the only way to do it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've changed it for a couple of API calls (and fixed them in the previous test as it was a copy-paste...) |
||
"INSERT INTO qiita.raw_filepath (raw_data_id, filepath_id) VALUES" | ||
"(%s, %s)", (1, exp_id)) | ||
|
||
# Get the filepaths so we can test if they've been removed or not | ||
sql_fp = "SELECT filepath FROM qiita.filepath WHERE filepath_id=%s" | ||
fp1 = self.conn_handler.execute_fetchone(sql_fp, (1,))[0] | ||
fp1 = join(get_db_files_base_dir(), fp1) | ||
|
||
# Make sure that the file exists - specially for travis | ||
with open(fp1, 'w') as f: | ||
f.write('\n') | ||
|
||
fp_exp_id = self.conn_handler.execute_fetchone(sql_fp, (exp_id,))[0] | ||
fp_exp_id = join(get_db_files_base_dir(), fp_exp_id) | ||
|
||
purge_filepaths(self.conn_handler) | ||
|
||
# Check that the files still exist | ||
self.assertTrue(exists(fp1)) | ||
self.assertTrue(exists(fp_exp_id)) | ||
|
||
# Unlink the filepath from the raw data | ||
self.conn_handler.execute( | ||
"DELETE FROM qiita.raw_filepath WHERE filepath_id=%s", (exp_id,)) | ||
|
||
# Only filepath 16 should be removed | ||
sql = ("SELECT filepath_id FROM qiita.filepath ORDER BY " | ||
"filepath_id") | ||
exp_ids = [i[0] for i in self.conn_handler.execute_fetchall(sql)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor but these list comprehensions could be replaced with skbio.util.flatten There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good call. Done! |
||
exp_ids.pop() | ||
purge_filepaths(self.conn_handler) | ||
obs_ids = [i[0] for i in self.conn_handler.execute_fetchall(sql)] | ||
self.assertEqual(obs_ids, exp_ids) | ||
|
||
# Check that only the file for the removed filepath has been removed | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should these tests also check that all the other previously existing file paths still exist? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are testing that only one of the other ones still exists. Fun fact: the filepaths that are present in the database, they actually do not exist. Which means that if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm if nobody else feels like this should be explicitly tested, then On (Feb-13-15|13:22), josenavas wrote:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea of the test framework is that before and after it always returns to the same state, right? So testing that this method erases those not used filepaths is not only safe but also a good exercise, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I too feel that way, or rather, I think it is only correct to verify On (Feb-13-15|13:44), Antonio Gonzalez wrote:
|
||
self.assertTrue(exists(fp1)) | ||
self.assertFalse(exists(fp_exp_id)) | ||
|
||
def test_move_filepaths_to_upload_folder(self): | ||
# setting up test, done here as this is the only test that uses these | ||
# files | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -697,26 +697,36 @@ def purge_filepaths(conn_handler=None): | |
""" | ||
conn_handler = conn_handler if conn_handler else SQLConnectionHandler() | ||
|
||
# Get all the (table, column) pairs that reference to the filepath table | ||
# Code adapted from http://stackoverflow.com/q/5347050/3746629 | ||
table_cols_pairs = conn_handler.execute_fetchall( | ||
"""SELECT R.TABLE_NAME, R.column_name | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ...at first i was afraid then I was petrified... This is pretty cool, good find! Following on the heels of one of @ElDeveloper prior comments on a different PR, would it be possible to cite the permanent URL for the SO post? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks! On Wed, Feb 11, 2015 at 2:22 PM, josenavas notifications@github.com wrote:
|
||
FROM INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE u | ||
INNER JOIN INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS FK | ||
ON U.CONSTRAINT_CATALOG = FK.UNIQUE_CONSTRAINT_CATALOG | ||
AND U.CONSTRAINT_SCHEMA = FK.UNIQUE_CONSTRAINT_SCHEMA | ||
AND U.CONSTRAINT_NAME = FK.UNIQUE_CONSTRAINT_NAME | ||
INNER JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE R | ||
ON R.CONSTRAINT_CATALOG = FK.CONSTRAINT_CATALOG | ||
AND R.CONSTRAINT_SCHEMA = FK.CONSTRAINT_SCHEMA | ||
AND R.CONSTRAINT_NAME = FK.CONSTRAINT_NAME | ||
WHERE U.COLUMN_NAME = 'filepath_id' | ||
AND U.TABLE_SCHEMA = 'qiita' | ||
AND U.TABLE_NAME = 'filepath'""") | ||
|
||
union_str = " UNION ".join( | ||
["SELECT %s FROM qiita.%s WHERE %s IS NOT NULL" % (col, table, col) | ||
for table, col in table_cols_pairs]) | ||
# Get all the filepaths from the filepath table that are not | ||
# referenced from any place in the database | ||
fps = conn_handler.execute_fetchall( | ||
"""SELECT filepath_id, filepath, filepath_type FROM qiita.filepath | ||
FP JOIN qiita.filepath_type FPT ON | ||
FP.filepath_type_id = FPT.filepath_type_id | ||
WHERE filepath_id NOT IN ( | ||
SELECT filepath_id FROM qiita.raw_filepath UNION | ||
SELECT filepath_id FROM qiita.preprocessed_filepath UNION | ||
SELECT filepath_id FROM qiita.processed_filepath UNION | ||
SELECT filepath_id FROM qiita.job_results_filepath UNION | ||
SELECT filepath_id FROM qiita.analysis_filepath UNION | ||
SELECT sequence_filepath FROM qiita.reference UNION | ||
SELECT taxonomy_filepath FROM qiita.reference UNION | ||
SELECT tree_filepath FROM qiita.reference)""") | ||
WHERE filepath_id NOT IN (%s)""" % union_str) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this also make sure the files are legitimately deleted off the filesystem at the time of database removal? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lines 724-739 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, sorry, not part of request so didn't see it. Carry on. |
||
|
||
# We can now go over and remove all the filepaths | ||
for fp_id, fp, fp_type in fps: | ||
conn_handler.execute("DELETE FROM qiita.sample_template_filepath " | ||
"WHERE filepath_id=%s", (fp_id,)) | ||
conn_handler.execute("DELETE FROM qiita.filepath WHERE filepath_id=%s", | ||
(fp_id,)) | ||
|
||
|
@@ -747,8 +757,6 @@ def move_filepaths_to_upload_folder(study_id, filepaths, conn_handler=None): | |
|
||
# We can now go over and remove all the filepaths | ||
for fp_id, fp, _ in filepaths: | ||
conn_handler.execute("DELETE FROM qiita.sample_template_filepath " | ||
"WHERE filepath_id=%s", (fp_id,)) | ||
conn_handler.execute("DELETE FROM qiita.filepath WHERE filepath_id=%s", | ||
(fp_id,)) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does this docstring mean? ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is an issue using the SQL command
NOT IN
if the list contains null values. There is a link in line 428 with more information. It was for clarity, but I think I did not achieve that; so I'm going to remove the docstring and leave only the comment with the linkThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ahhh, now I understand the phrasing of the docstring. Thanks!
On (Feb-13-15|13:07), josenavas wrote: