Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make data file names POSIX compliant #2732

Merged
merged 1 commit into from
Apr 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 9 additions & 9 deletions refinery/file_store/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import logging
import os
import re
import string
import urlparse

from django.conf import settings
Expand Down Expand Up @@ -47,15 +48,11 @@ def _mkdir(path):

def file_path(instance, filename):
"""Construct relative file system path for new file store files relative to
FILE_STORE_BASE_DIR.
FILE_STORE_BASE_DIR
instance: FileStoreItem instance
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't look like instance ever gets used here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. However, file_path is passed to upload_to as a callable.
"This callable must be able to accept two arguments"
https://docs.djangoproject.com/en/1.8/ref/models/fields/#django.db.models.FileField.upload_to

filename: requested filename
Based on
http://michaelandrews.typepad.com/the_technical_times/2009/10/creating-a-hashed-directory-structure.html

:param instance: FileStoreItem instance.
:type instance: FileStoreItem.
:param filename: requested filename.
:type filename: str.
:returns: str -- if success, None if failure.
"""
hashcode = hash(filename)
mask = 255 # bitmask
Expand All @@ -64,8 +61,11 @@ def file_path(instance, filename):
# provides 256 * 256 = 65536 of possible directory combinations
dir1 = "{:0>2x}".format(hashcode & mask)
dir2 = "{:0>2x}".format((hashcode >> 8) & mask)
# Galaxy doesn't process names with parentheses in them
filename = re.sub('[()]', '_', filename)
# make file name compliant with POSIX "fully portable filenames"
# max length: 255, allowed character set: [A-Za-z0-9._-]
# remove leading '-' characters to make file management easier
filename = re.sub('[^A-Za-z0-9._-]', '_',
string.lstrip(filename, '-'))[-255:]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the idea that the last part of the filename has the extension, and you definitely want to preserve that? Hopefully we never get files this long, but for people, the start of the name is also significant... it'd be more complicated, and probably not worth it, but I could imagine removing the part of the name just before the extension.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think most file systems in use today have a limit of 255 on file name length which means there should really be no files that are above that. So this is more of a last line of defense for which simple handling is sufficient.

return os.path.join(dir1, dir2, filename)


Expand Down
50 changes: 39 additions & 11 deletions refinery/file_store/test_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import random
import string
from urlparse import urljoin
import uuid

Expand All @@ -18,17 +20,6 @@

class FileStoreModuleTest(TestCase):

def test_file_path(self):
path = file_path(FileStoreItem(), 'test.fastq')
self.assertIn('test.fastq', path)

def test_file_path_underscore_replacement(self):
filename = 'Kc.dMi-2(Q4443).wig_5.tdf'
new_filename = 'Kc.dMi-2_Q4443_.wig_5.tdf'
path = file_path(FileStoreItem(), filename)
self.assertIn(new_filename, path)
self.assertNotIn(filename, path)

def test_get_temp_dir(self):
self.assertEqual(get_temp_dir(), settings.FILE_STORE_TEMP_DIR)

Expand Down Expand Up @@ -84,6 +75,43 @@ def test_file_source_map(self):
'/example/path/test_file.dat')


class FilePathTest(TestCase):

def test_file_path_format(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe file_path needs a FileStoreItem instance in any of these tests

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The signature of file_path requires a FileStoreItem instance.

path = file_path(FileStoreItem(), 'test.fastq')
self.assertEqual(path, '2f/e3/test.fastq')

def test_leading_dash_removal(self):
path = file_path(FileStoreItem(), '-test.fastq')
self.assertEqual(path, '5b/85/test.fastq')

def test_multiple_leading_dash_removal(self):
path = file_path(FileStoreItem(), '--test.fastq')
self.assertEqual(path, '7b/20/test.fastq')

def test_max_name_length(self):
name = ''.join(random.choice(string.ascii_letters) for _ in range(256))
path = file_path(FileStoreItem(), name)
self.assertEqual(len(os.path.basename(path)), 255)

def test_remove_extra_leading_characters_over_max_length(self):
name = ''.join(random.choice(string.ascii_letters) for _ in range(256))
path = file_path(FileStoreItem(), name)
self.assertEqual(os.path.basename(path), name[-255:])

def test_forward_slash_replacement(self):
path = file_path(FileStoreItem(), 'Wig/BedGraph-to-bigWig.bigwig')
self.assertIn('Wig_BedGraph-to-bigWig.bigwig', path)

def test_space_character_replacement(self):
path = file_path(FileStoreItem(), 'test file.fastq')
self.assertIn('test_file.fastq', path)

def test_parentheses_replacement(self):
path = file_path(FileStoreItem(), 'Kc.dMi-2(Q4443).wig_5.tdf')
self.assertIn('Kc.dMi-2_Q4443_.wig_5.tdf', path)


class FileStoreItemTest(TestCase):

def setUp(self):
Expand Down