Implement datasets through unions and filters over specified graphs b…

…y re-writing the query. Still fails dataset-012b
ruby-rdf · Feb 7, 2013 · 6034b57 · 6034b57
1 parent 422cadf
commit 6034b57
Show file tree

Hide file tree

Showing 9 changed files with 259 additions and 39 deletions.
diff --git a/Gemfile b/Gemfile
@@ -9,6 +9,7 @@ gem "rdf-xsd",        :git => "git://github.com/ruby-rdf/rdf-xsd.git"
 
 group :development do
   gem 'shotgun'
+  gem 'redcarpet'
 end
 
 group :debug do

diff --git a/examples/issue3.rb b/examples/issue3.rb
@@ -19,7 +19,6 @@ def query(pattern, &block)
         :predicate => RDF::URI.new('http://localhost/attribute_types/last_name'),
         :object    => RDF::Literal.new('smith'))
     elsif pattern[:predicate].path == '/attribute_types/middle_name'
-      require 'debugger'; debugger
       statements << RDF::Statement.new(
         :subject   => RDF::URI.new('http://localhost/people/2'),
         :predicate => RDF::URI.new('http://localhost/attribute_types/middle_name'),

diff --git a/lib/sparql/algebra/extensions.rb b/lib/sparql/algebra/extensions.rb
@@ -107,6 +107,13 @@ class RDF::Query
   ##
   # Compares this query with another object.
   #
   # Two queries are equal when `other` is an `RDF::Query` whose patterns
   # and context both match this query's. The context must be compared
   # against `other.context`; comparing `context` with itself is always
   # true and made queries with different contexts look identical.
   #
   # @param  [Object] other
   # @return [Boolean]
   def ==(other)
     other.is_a?(RDF::Query) && patterns == other.patterns && context == other.context
   end
+
+  ##
+  # A query is a leaf for rewriting: it returns itself without yielding to the block.
+  # @return [RDF::Query] `self`
+  def rewrite(&block)
+    self
+  end
 
   # Transform Query into an Array form of an SSE
   #

diff --git a/lib/sparql/algebra/operator.rb b/lib/sparql/algebra/operator.rb
@@ -322,6 +322,26 @@ def optimize
       end
     end
 
+    ##
+    # Rewrite operands by yielding each operand. Recursively descends
+    # through operands implementing this method. Replaces `@operands` in place.
+    #
+    # @yield operand
+    # @yieldparam [Object] operand
+    # @yieldreturn [SPARQL::Algebra::Expression, nil] the re-written operand, or nil/false to fall back to recursion
+    # @return [SPARQL::Algebra::Expression] `self`
+    def rewrite(&block)
+      @operands = @operands.map do |op|
+        # Offer the operand to the block; a truthy result replaces it as-is
+        unless new_op = block.call(op)
+          # Block declined: descend into the operand if it supports rewriting, else keep it
+          new_op = op.respond_to?(:rewrite) ? op.rewrite(&block) : op
+        end
+        new_op
+      end
+      self
+    end
+
     ##
     # Returns the SPARQL S-Expression (SSE) representation of this operator.
     #
@@ -362,6 +382,7 @@ def eql?(other)
       other.class == self.class && other.operands == self.operands
     end
     alias_method :==, :eql?
+
   protected
 
     ##
@@ -401,6 +422,27 @@ def boolean(literal)
       end
     end
 
+    ##
+    # Transform an array of expressions into a right-nested chain
+    # of binary operations built with `klass.new`
+    # e.g.: a || b || c => (|| a (|| b c))
+    # @param [Class] klass Binary Operator class
+    # @param [Array<SPARQL::Algebra::Expression>] expressions one or more expressions; raises RuntimeError when empty
+    # @return [SPARQL::Algebra::Expression] the lone expression when only one is given, otherwise a `klass` instance
+    def to_binary(klass, *expressions)
+      case expressions.length
+      when 0
+        # Nothing to combine. NOTE(review): message says "two or more", yet one expression is accepted below
+        raise "Operator#to_binary requires two or more expressions"
+      when 1
+        expressions.first # a single expression needs no wrapping
+      when 2
+        klass.new(*expressions)
+      else
+        klass.new(expressions.shift, to_binary(klass, *expressions)) # head, then recurse on the remainder
+      end
+    end
+
   private
 
     @@subclasses = [] # @private

diff --git a/lib/sparql/algebra/operator/dataset.rb b/lib/sparql/algebra/operator/dataset.rb
@@ -15,8 +15,8 @@ class Operator
     # either bare, indicating a default dataset, or expressed as an array `\[:named, \<uri\>\]`,
     # indicating that it represents a named data source.
     #
-    # This operator loads the document referenced by the URI into the dataset
-    # using `uri` as the graph name, unless it already exists within the dataset.
+    # This operator loads from the datasource, unless a graph named by
+    # the datasource URI already exists in the repository.
     #
     # The contained BGP queries are then performed against the specified
     # default and named graphs. Rather than using the actual default
@@ -25,23 +25,84 @@ class Operator
     # and the results are filtered against those URIs included in
     # the default dataset.
     #
-    # @example
+    # Specifically, each BGP which is not part of a graph pattern
+    # is replaced with a union of graph patterns with that BGP repeated
+    # for each graph URI in the default dataset. This requires recursively
+    # updating the operator.
+    #
+    # Each graph pattern containing a variable graph name is replaced
+    # by a filter on that variable such that the variable must match
+    # only those named datasets specified.
+    #
+    # @example Dataset with one default and one named data source
     #
     #     (prefix ((: <http://example/>))
     #       (dataset (<data-g1.ttl> (named <data-g2.ttl>))
     #         (union
     #           (bgp (triple ?s ?p ?o))
     #           (graph ?g (bgp (triple ?s ?p ?o))))))
     #
-    # is effectively re-written to the following:
+    #     is effectively re-written to the following:
     #
     #     (prefix ((: <http://example/>))
-    #       (dataset (<data-g1.ttl> (named <data-g2.ttl>))
-    #         (filter (= ??g <data-g1.ttl>)
-    #           (union
-    #             (graph ??g (bgp (triple ?s ?p ?o)))
-    #             (graph ?g (bgp (triple ?s ?p ?o)))))))
+    #       (union
+    #         (graph <data-g1.ttl> (bgp (triple ?s ?p ?o)))
+    #         (filter (= ?g <data-g2.ttl>)
+    #           (graph ?g (bgp (triple ?s ?p ?o))))))
+    #
+    # If no default or no named graphs are specified, these queries
+    # are eliminated.
+    #
+    # @example Dataset with one default and no named data sources
+    #
+    #     (prefix ((: <http://example/>))
+    #       (dataset (<data-g1.ttl>)
+    #         (union
+    #           (bgp (triple ?s ?p ?o))
+    #           (graph ?g (bgp (triple ?s ?p ?o))))))
+    #
+    #     is effectively re-written to the following:
+    #
+    #     (prefix ((: <http://example/>))
+    #       (union
+    #         (graph <data-g1.ttl> (bgp (triple ?s ?p ?o)))
+    #         (bgp)))
+    #
+    # Multiple default graphs union the information from a graph query
+    # on each default datasource.
+    #
+    # @example Dataset with two default data sources
+    #
+    #     (prefix ((: <http://example/>))
+    #       (dataset (<data-g1.ttl> <data-g2.ttl>)
+    #         (bgp (triple ?s ?p ?o))))
     #
+    #     is effectively re-written to the following:
+    #
+    #     (prefix ((: <http://example/>))
+    #       (union
+    #         (graph <data-g1.ttl> (bgp (triple ?s ?p ?o)))
+    #         (graph <data-g2.ttl> (bgp (triple ?s ?p ?o)))))
+    #
+    # Multiple named graphs place a filter on all variables used
+    # to identify those named graphs so that they are restricted
+    # to come only from the specified set. Note that this requires
+    # descending through expressions to find graph patterns using
+    # variables and placing a filter on each identified variable.
+    #
+    # @example Dataset with two named data sources
+    #
+    #     (prefix ((: <http://example/>))
+    #       (dataset ((named <data-g1.ttl>) (named <data-g2.ttl>))
+    #         (graph ?g (bgp (triple ?s ?p ?o)))))
+    #
+    #     is effectively re-written to the following:
+    #
+    #     (prefix ((: <http://example/>))
+    #       (filter (|| (= ?g <data-g1.ttl>) (= ?g <data-g2.ttl>))
+    #         (graph ?g (bgp (triple ?s ?p ?o)))))
+    #
+    # @example Dataset with multiple named graphs
     # @see http://www.w3.org/TR/rdf-sparql-query/#specifyingDataset
     class Dataset < Binary
       include Query
@@ -75,7 +136,8 @@ class Dataset < Binary
       # @see    http://www.w3.org/TR/rdf-sparql-query/#sparqlAlgebra
       def execute(queryable, options = {})
         debug(options) {"Dataset"}
-        default_graphs = []
+        default_datasets = []
+        named_datasets = []
         operand(0).each do |ds|
           load_opts = {
             :headers => {"Accept" => ACCEPTS}
@@ -94,11 +156,12 @@ def execute(queryable, options = {})
             uri = self.base_uri ? self.base_uri.join(ds.last) : ds.last
             uri.lexical = ds.last
             debug(options) {"=> named data source #{uri}"}
+            named_datasets << uri
           else
             debug(options) {"=> array: join #{self.base_uri.inspect} to #{ds.inspect}"}
             uri = self.base_uri ? self.base_uri.join(ds) : ds
             debug(options) {"=> default data source #{uri}"}
-            default_graphs << uri
+            default_datasets << uri
           end
           load_opts[:context] = load_opts[:base_uri] = uri
           unless queryable.has_context?(uri)
@@ -107,27 +170,51 @@ def execute(queryable, options = {})
           end
         end
 
-        # Query binding a non-distinguishded variable to context
-        default_var = RDF::Query::Variable.new
-        default_var.distinguished = false
-
-        @solutions = operands.last.execute(queryable, options.merge(
-          :context => default_var,
-          :depth => options[:depth].to_i + 1)
-        ).filter do |soln|
-          # Reject solutions with bindings to default_var where the value
-          # is not a specified default graph
-          debug(options) {"=> filter: #{soln.inspect}"}
-          if soln.unbound?(default_var)
-            true
-          elsif default_graphs.include?(soln[default_var])
-            # Remove the variable from the solution and match
-            # FIXME: this should either go in RDF::Query::Solution,
-            # or there should be a immutable way of performing this
-            # as an operation on RDF::Query::Solutions
-            soln.bindings.delete(default_var.to_sym)
+        # Re-write the operand:
+        #require 'debugger'; breakpoint
+        operator = self.rewrite do |op|
+          case op
+          when Operator::Graph
+            if named_datasets.empty?
+              # * If there are no named datasets, remove all (graph)
+              #   operations.
+              debug(options) {"=> #{op.to_sxp} => (bgp)"}
+              Operator::BGP.new
+            elsif (name = op.operand(0)).is_a?(RDF::Resource)
+              # It must match one of the named_datasets
+              debug(options) {"=> #{op.to_sxp} => (bgp)"}
+              named_datasets.include?(name) ? op : Operator::BGP.new
+            else
+              # Name is a variable, replace op with a filter on that
+              # variable and op
+              filter_expressions = named_datasets.map {|u| Operator::Equal.new(name, u)}
+              debug(options) {"=> #{op.to_sxp} => (filter (...) #{op.to_sxp})"}
+              filt = to_binary(Operator::Or, *filter_expressions)
+              Operator::Filter.new(filt, op)
+            end
+          when RDF::Query # Operator::BGP
+            case default_datasets.length
+            when 0
+              # No Default Datasets, no query to run
+              debug(options) {"=> #{op.to_sxp} => (bgp)"}
+              Operator::BGP.new
+            when 1
+              # A single dataset, write as (graph <dataset> (bgp))
+              debug(options) {"=> #{op.to_sxp} => (graph <#{default_datasets.first}> #{op.to_sxp})"}
+              Operator::Graph.new(default_datasets.first, op)
+            else
+              # Several, rewrite as Union
+              debug(options) {"=> #{op.to_sxp} => (union ...)"}
+              to_binary(Operator::Union, *default_datasets.map {|u| Operator::Graph.new(u, op)})
+            end
+          else
+            nil
           end
         end
+        executable = operator.operands.last
+        debug(options) {"=> rewritten: #{executable.to_sxp}"}
+
+        @solutions = executable.execute(queryable, options.merge(:depth => options[:depth].to_i + 1))
       end
 
       ##

diff --git a/lib/sparql/algebra/operator/filter.rb b/lib/sparql/algebra/operator/filter.rb
@@ -33,7 +33,7 @@ class Filter < Operator::Binary
       # @see    http://www.w3.org/TR/rdf-sparql-query/#sparqlAlgebra
       # @see    http://www.w3.org/TR/rdf-sparql-query/#ebv
       def execute(queryable, options = {})
-        debug(options) {"Filter #{operands.first}"}
+        debug(options) {"Filter #{operands.first.to_sxp}"}
         @solutions = operands.last.execute(queryable, options.merge(:depth => options[:depth].to_i + 1))
         debug(options) {"=>(before) #{@solutions.map(&:to_hash).inspect}"}
         @solutions = @solutions.filter do |solution|

diff --git a/lib/sparql/algebra/operator/graph.rb b/lib/sparql/algebra/operator/graph.rb
@@ -45,6 +45,13 @@ def execute(queryable, options = {})
       ##
       # Optimizes each operand and stores the results as this operator's operands.
       #
       # The assignment must target `@operands`: writing
       # `operands = operands.map(...)` declares a local variable that is
       # still nil when the right-hand side is evaluated, so the call
       # raised NoMethodError instead of optimizing anything.
       #
       # @return [Graph] `self`
       def optimize
         @operands = operands.map(&:optimize)
         self
       end
+
+      ##
+      # Graph patterns are not descended into when rewriting: returns itself without yielding to the block.
+      # @return [Graph] `self`
+      def rewrite(&block)
+        self
+      end
     end # Graph
   end # Operator
 end; end # SPARQL::Algebra
diff --git a/lib/sparql/algebra/operator/union.rb b/lib/sparql/algebra/operator/union.rb
@@ -29,20 +29,19 @@ class Union < Operator::Binary
       # @see    http://www.w3.org/TR/rdf-sparql-query/#sparqlAlgebra
       def execute(queryable, options = {})
         debug(options) {"Union"}
-        solutions1 = operand(0).execute(queryable, options.merge(:depth => options[:depth].to_i + 1))
-        debug(options) {"=>(left) #{solutions1.inspect}"}
-        solutions2 = operand(1).execute(queryable, options.merge(:depth => options[:depth].to_i + 1))
-        debug(options) {"=>(right) #{solutions2.inspect}"}
-        @solutions = RDF::Query::Solutions.new(solutions1 + solutions2)
+        @solutions = RDF::Query::Solutions.new(operands.inject([]) do |memo, op|
+          solns = op.execute(queryable, options.merge(:depth => options[:depth].to_i + 1))
+          debug(options) {"=> (op) #{solns.inspect}"}
+          memo + solns
+        end)
         debug(options) {"=> #{@solutions.inspect}"}
         @solutions
       end
 
       ##
       # Returns an optimized version of this query.
       #
-      # If optimize operands, and if the first two operands are both Queries, replace
-      # with the unique sum of the query elements
+      # Optimize operands and remove any which are empty.
       #
       # @return [Union, RDF::Query] `self`
       def optimize