Mccalluc/add fields to datasetmanager index (#1824)
* Add "generic" facets

* The RE is slightly more general than the old list of replaced characters, but that is probably what we want

* May be gratuitous, but I think this makes it clearer that we begin from a stem and add modifiers

* Checkpoint: New test fails, looking for a FileStoreItem. (The error is swallowed and the method returns None, which doesn't feel like the right behavior.)

* Test passes, but TODO: add attributes that will exercise the new logic

* Remap keys, as they are not stable

* Remap values, too; add tearDown

* ... but do not remap uuids

* Checkpoint: This should be adding an attribute... but the test still passes with no other changes, so not quite right
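A minimal sketch of the field-naming scheme these commits introduce, assuming REFINERY_SOLR_SPACE_DYNAMIC_FIELDS is "_" and using an invented attribute (the real logic is in the search_indexes.py hunk below): each annotated attribute now yields both a per-study/assay key and a shared "generic" key.

    import re

    SEPARATOR = '_'  # stand-in for settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS

    name = 'Characteristics'               # stem: annotation.attribute_type
    name = 'cell type' + '_' + name        # modifier: annotation.attribute_subtype (invented)
    name = re.sub(r'\W', SEPARATOR, name)  # -> 'cell_type_Characteristics'

    uuid = '12' + '_' + '34'               # study.id + '_' + assay.id (made up)
    uniq_key = name + '_' + uuid + '_s'    # 'cell_type_Characteristics_12_34_s'
    generic_key = name + '_generic_s'      # 'cell_type_Characteristics_generic_s'

Values for both keys are accumulated in a set and joined with " + ", so a node with several parents keeps every distinct value.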
mccalluc committed Jun 26, 2017
1 parent 77597a4 commit fb6d99b
Showing 2 changed files with 110 additions and 44 deletions.
61 changes: 25 additions & 36 deletions refinery/data_set_manager/search_indexes.py
@@ -5,15 +5,15 @@
'''

import logging
import string
import re

from django.conf import settings

from haystack import indexes

from .models import AnnotatedNode, Node
from file_store.models import FileStoreItem

from .models import AnnotatedNode, Node

logger = logging.getLogger(__name__)


@@ -60,33 +60,20 @@ def prepare(self, object):
uuid += "_" + str(object.assay.id)
# create dynamic fields for each attribute
for annotation in annotations:
if annotation.attribute_subtype is None:
name = annotation.attribute_type
else:
name = annotation.attribute_subtype + "_" + \
annotation.attribute_type
if annotation.attribute_value_unit is None:
value = annotation.attribute_value
else:
value = annotation.attribute_value + " " + \
annotation.attribute_value_unit
# replace problematic characters in field names
name = string.replace(name, "/",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)
name = string.replace(name, "(",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)
name = string.replace(name, ")",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)
name = string.replace(name, "#",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)
name = string.replace(name, ",",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)
name = string.replace(name, " ",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)
name = string.replace(name, "'",
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS)

key = name + "_" + uuid + "_s"
name = annotation.attribute_type
if annotation.attribute_subtype is not None:
name = annotation.attribute_subtype + "_" + name

value = annotation.attribute_value
if annotation.attribute_value_unit is not None:
value += " " + annotation.attribute_value_unit

name = re.sub(r'\W',
settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS,
name)

uniq_key = name + "_" + uuid + "_s"
generic_key = name + "_generic_s"
# a node might have multiple parents with different attribute
# values for a given attribute
# e.g. parentA Characteristic[cell type] = K562 and
@@ -95,13 +82,15 @@
# concatenation of the unique list
# old version (only one attribute kept):
# data[key] = value
if key not in data:
data[key] = set()
if value != "":
data[key].add(value)
else:
data[key].add("N/A")
for key in (uniq_key, generic_key):
if key not in data:
data[key] = set()
if value != "":
data[key].add(value)
else:
data[key].add("N/A")
# iterate over all keys in data and join sets into strings
# TODO: This doesn't feel right: facet each separately?
for key, value in data.iteritems():
if type(value) is set:
data[key] = " + ".join(value)
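As an aside on the sanitization change above: the new re.sub call is broader than the chain of string.replace calls it removes, which is what the commit message means by "slightly more general". A small comparison, assuming REFINERY_SOLR_SPACE_DYNAMIC_FIELDS is "_" and using an invented name:

    import re

    SEPARATOR = '_'            # assumed value of settings.REFINERY_SOLR_SPACE_DYNAMIC_FIELDS
    OLD_CHARS = "/()#, '"      # the seven characters the removed string.replace calls handled

    name = 'Factor Value[dose/level]'   # invented attribute name

    old_style = name
    for char in OLD_CHARS:
        old_style = old_style.replace(char, SEPARATOR)
    new_style = re.sub(r'\W', SEPARATOR, name)

    print(old_style)  # 'Factor_Value[dose_level]' -- brackets are left alone
    print(new_style)  # 'Factor_Value_dose_level_' -- every non-word character is replaced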
93 changes: 85 additions & 8 deletions refinery/data_set_manager/tests.py
@@ -1,18 +1,21 @@
import json
from StringIO import StringIO

import re
from StringIO import StringIO

from django.contrib.auth.models import User
from django.core.files.uploadedfile import (InMemoryUploadedFile,
SimpleUploadedFile)

from django.http import QueryDict
from django.test import TestCase

from rest_framework.test import APIClient, APIRequestFactory, APITestCase

from .models import Assay, AttributeOrder, Study, Investigation, Node
from core.models import DataSet, ExtendedGroup, InvestigationLink
from core.views import NodeViewSet
from file_store.models import FileStoreItem

from .models import (Assay, Attribute, AttributeOrder, Investigation, Node,
Study)
from .search_indexes import NodeIndex
from .serializers import AttributeOrderSerializer
from .utils import (create_facet_filter_query, customize_attribute_response,
escape_character_solr, format_solr_response,
@@ -24,9 +27,6 @@
insert_facet_field_filter, is_field_in_hidden_list,
objectify_facet_field_counts, update_attribute_order_ranks)
from .views import Assays, AssaysAttributes
from core.models import DataSet, ExtendedGroup, InvestigationLink
from core.views import NodeViewSet
from file_store.models import FileStoreItem


class AssaysAPITests(APITestCase):
@@ -1572,3 +1572,80 @@ def test_get_basic_node(self):
self.assertTrue('subanalysis' in self.get_response.data[0])
self.assertTrue('type' in self.get_response.data[0])
self.assertTrue('uuid' in self.get_response.data[0])


class NodeIndexTests(APITestCase):

def setUp(self):
investigation = Investigation.objects.create()
study = Study.objects.create(investigation=investigation)
assay = Assay.objects.create(study=study)

test_file = StringIO()
test_file.write('Coffee is great.\n')
file_store_item = FileStoreItem.objects.create(
datafile=InMemoryUploadedFile(
test_file,
field_name='tempfile',
name='test_file.txt',
content_type='text/plain',
size=len(test_file.getvalue()),
charset='utf-8'
)
)

self.node = Node.objects.create(
assay=assay,
study=study,
file_uuid=file_store_item.uuid)

self.assay_uuid = assay.uuid
self.study_uuid = study.uuid
self.file_uuid = file_store_item.uuid
self.node_uuid = self.node.uuid

Attribute.objects.create(
node=self.node,
type=Attribute.CHARACTERISTICS,
subtype='fake subtype',
value='fake value'
)

self.maxDiff = None

def tearDown(self):
FileStoreItem.objects.all().delete()

def test_prepare(self):
data = NodeIndex().prepare(self.node)
data = dict(
(
re.sub(r'\d+', '#', k),
re.sub(r'\d+', '#', v) if
type(v) in (unicode, str) and not('uuid' in k)
else v
)
for (k, v) in data.items())
self.assertEqual(data,
{'REFINERY_ANALYSIS_UUID_#_#_s': 'N/A',
'REFINERY_FILETYPE_#_#_s': None,
'REFINERY_NAME_#_#_s': u'',
'REFINERY_SUBANALYSIS_#_#_s': -1,
'REFINERY_TYPE_#_#_s': u'',
'REFINERY_WORKFLOW_OUTPUT_#_#_s': 'N/A',
'analysis_uuid': None,
'assay_uuid': self.assay_uuid,
u'django_ct': u'data_set_manager.node',
u'django_id': u'#',
'file_uuid': self.file_uuid,
'genome_build': None,
u'id': u'data_set_manager.node.#',
'is_annotation': False,
'name': u'',
'species': None,
'study_uuid': self.study_uuid,
'subanalysis': None,
'text': u'',
'type': u'',
'uuid': self.node_uuid,
'workflow_output': None})
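The digit remapping in test_prepare above exists because database ids baked into the dynamic field names (and into django_id) are not stable between runs. Keys are always normalized; values are only normalized when they are strings and the key does not mention 'uuid', so real identifiers still match self.assay_uuid and friends. A tiny worked example with a hypothetical key:

    import re

    key = 'REFINERY_ANALYSIS_UUID_7_13_s'   # raw key embedding study and assay ids (made up)
    print(re.sub(r'\d+', '#', key))         # prints 'REFINERY_ANALYSIS_UUID_#_#_s'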
