diff --git a/README.md b/README.md index 2e44c2c..477ea38 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,43 @@ -# arrow 12.0.1-1 +# arrow 13.0.0-1 - - mingw-w64-i686-arrow-12.0.1-1-any.pkg.tar.xz + - mingw-w64-i686-arrow-13.0.0-1-any.pkg.tar.xz - mingw-w64-i686-aws-sdk-cpp-1.7.365-1-any.pkg.tar.xz - mingw-w64-i686-brotli-1.0.9-4-any.pkg.tar.xz - - mingw-w64-i686-openssl-1.1.1.m-9800-any.pkg.tar.xz + - mingw-w64-i686-openssl-3.1.1-9800-any.pkg.tar.xz - mingw-w64-i686-lz4-1.8.2-1-any.pkg.tar.xz - mingw-w64-i686-re2-20200801-1-any.pkg.tar.xz - mingw-w64-i686-snappy-1.1.7-2-any.pkg.tar.xz - mingw-w64-i686-bzip2-1.0.8-1-any.pkg.tar.xz - - mingw-w64-i686-curl-7.84.0-9000-any.pkg.tar.xz - - mingw-w64-i686-libssh2-1.11.0-9800-any.pkg.tar.xz + - mingw-w64-i686-curl-8.1.2-9000-any.pkg.tar.xz + - mingw-w64-i686-libssh2-1.11.0-9801-any.pkg.tar.xz - mingw-w64-i686-thrift-0.13.0-1-any.pkg.tar.xz - mingw-w64-i686-zstd-1.4.4-1-any.pkg.tar.xz - mingw-w64-i686-libutf8proc-2.4.0-2-any.pkg.tar.xz - mingw-w64-i686-nghttp2-1.51.0-1-any.pkg.tar.xz - - mingw-w64-x86_64-arrow-12.0.1-1-any.pkg.tar.xz + - mingw-w64-x86_64-arrow-13.0.0-1-any.pkg.tar.xz - mingw-w64-x86_64-aws-sdk-cpp-1.7.365-1-any.pkg.tar.xz - mingw-w64-x86_64-brotli-1.0.9-4-any.pkg.tar.xz - - mingw-w64-x86_64-openssl-1.1.1.m-9800-any.pkg.tar.xz + - mingw-w64-x86_64-openssl-3.1.1-9800-any.pkg.tar.xz - mingw-w64-x86_64-lz4-1.8.2-1-any.pkg.tar.xz - mingw-w64-x86_64-re2-20200801-1-any.pkg.tar.xz - mingw-w64-x86_64-snappy-1.1.7-2-any.pkg.tar.xz - mingw-w64-x86_64-bzip2-1.0.8-1-any.pkg.tar.xz - - mingw-w64-x86_64-curl-7.84.0-9000-any.pkg.tar.xz - - mingw-w64-x86_64-libssh2-1.11.0-9800-any.pkg.tar.xz + - mingw-w64-x86_64-curl-8.1.2-9000-any.pkg.tar.xz + - mingw-w64-x86_64-libssh2-1.11.0-9801-any.pkg.tar.xz - mingw-w64-x86_64-thrift-0.13.0-1-any.pkg.tar.xz - mingw-w64-x86_64-zstd-1.4.4-1-any.pkg.tar.xz - mingw-w64-x86_64-libutf8proc-2.4.0-2-any.pkg.tar.xz - mingw-w64-x86_64-nghttp2-1.51.0-1-any.pkg.tar.xz - - mingw-w64-ucrt-x86_64-arrow-12.0.1-1-any.pkg.tar.xz + - mingw-w64-ucrt-x86_64-arrow-13.0.0-1-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-aws-sdk-cpp-1.7.365-1-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-brotli-1.0.9-4-any.pkg.tar.xz - - mingw-w64-ucrt-x86_64-openssl-1.1.1.m-9800-any.pkg.tar.xz + - mingw-w64-ucrt-x86_64-openssl-3.1.1-9800-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-lz4-1.8.2-1-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-re2-20200801-1-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-snappy-1.1.7-2-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-bzip2-1.0.8-1-any.pkg.tar.xz - - mingw-w64-ucrt-x86_64-curl-7.84.0-9000-any.pkg.tar.xz - - mingw-w64-ucrt-x86_64-libssh2-1.11.0-9800-any.pkg.tar.xz + - mingw-w64-ucrt-x86_64-curl-8.1.2-9000-any.pkg.tar.xz + - mingw-w64-ucrt-x86_64-libssh2-1.11.0-9801-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-thrift-0.13.0-1-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-zstd-1.4.4-1-any.pkg.tar.xz - mingw-w64-ucrt-x86_64-libutf8proc-2.4.0-2-any.pkg.tar.xz diff --git a/include/arrow/acero/api.h b/include/arrow/acero/api.h new file mode 100644 index 0000000..c9724fd --- /dev/null +++ b/include/arrow/acero/api.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +/// \defgroup acero-api Utilities for creating and executing execution plans +/// @{ +/// @} + +/// \defgroup acero-nodes Options classes for the various exec nodes +/// @{ +/// @} + +#include "arrow/acero/exec_plan.h" +#include "arrow/acero/options.h" diff --git a/include/arrow/acero/asof_join_node.h b/include/arrow/acero/asof_join_node.h index b2ad2ed..6a0ce8f 100644 --- a/include/arrow/acero/asof_join_node.h +++ b/include/arrow/acero/asof_join_node.h @@ -30,16 +30,11 @@ using AsofJoinKeys = AsofJoinNodeOptions::Keys; /// \brief Make the output schema of an as-of-join node /// -/// Optionally, also provides the field output indices for this node. -/// \see arrow::engine::RelationInfo -/// /// \param[in] input_schema the schema of each input to the node /// \param[in] input_keys the key of each input to the node -/// \param[out] field_output_indices the output index of each field ARROW_ACERO_EXPORT Result> MakeOutputSchema( const std::vector>& input_schema, - const std::vector& input_keys, - std::vector* field_output_indices = NULLPTR); + const std::vector& input_keys); } // namespace asofjoin } // namespace acero diff --git a/include/arrow/acero/exec_plan.h b/include/arrow/acero/exec_plan.h index cdaab96..04303aa 100644 --- a/include/arrow/acero/exec_plan.h +++ b/include/arrow/acero/exec_plan.h @@ -48,7 +48,7 @@ using compute::threaded_exec_context; namespace acero { -/// \addtogroup execnode-components +/// \addtogroup acero-internals /// @{ class ARROW_ACERO_EXPORT ExecPlan : public std::enable_shared_from_this { @@ -118,6 +118,10 @@ class ARROW_ACERO_EXPORT ExecPlan : public std::enable_shared_from_this; @@ -173,9 +177,9 @@ class ARROW_ACERO_EXPORT ExecNode { /// non-deterministic. For example, a hash-join has no predictable output order. /// /// If the ordering is Ordering::Implicit then there is a meaningful order but that - /// odering is not represented by any column in the data. The most common case for this - /// is when reading data from an in-memory table. The data has an implicit "row order" - /// which is not neccesarily represented in the data set. + /// ordering is not represented by any column in the data. The most common case for + /// this is when reading data from an in-memory table. The data has an implicit "row + /// order" which is not necessarily represented in the data set. /// /// A filter or project node will not modify the ordering. Nothing needs to be done /// other than ensure the index assigned to output batches is the same as the @@ -321,7 +325,7 @@ class ARROW_ACERO_EXPORT ExecNode { /// /// This is not a pause. There will be no way to start the source again after this has /// been called. 
- Status StopProducing(); + virtual Status StopProducing(); std::string ToString(int indent = 0) const; @@ -377,16 +381,36 @@ inline Result MakeExecNode( return factory(plan, std::move(inputs), options); } -/// \brief Helper class for declaring sets of ExecNodes efficiently +/// @} + +/// \addtogroup acero-api +/// @{ + +/// \brief Helper class for declaring execution nodes /// -/// A Declaration represents an unconstructed ExecNode (and potentially more since its -/// inputs may also be Declarations). The node can be constructed and added to a plan -/// with Declaration::AddToPlan, which will recursively construct any inputs as necessary. +/// A Declaration represents an unconstructed ExecNode (and potentially an entire graph +/// since its inputs may also be Declarations) +/// +/// A Declaration can be converted to a plan and executed using one of the +/// DeclarationToXyz methods. +/// +/// For more direct control, a Declaration can be added to an existing execution +/// plan with Declaration::AddToPlan, which will recursively construct any inputs as +/// necessary. struct ARROW_ACERO_EXPORT Declaration { using Input = std::variant; Declaration() {} + /// \brief construct a declaration + /// \param factory_name the name of the exec node to construct. The node must have + /// been added to the exec node registry with this name. + /// \param inputs the inputs to the node, these should be other declarations + /// \param options options that control the behavior of the node. You must use + /// the appropriate subclass. For example, if `factory_name` is + /// "project" then `options` should be ProjectNodeOptions. + /// \param label a label to give the node. Can be used to distinguish it from other + /// nodes of the same type in the plan. Declaration(std::string factory_name, std::vector inputs, std::shared_ptr options, std::string label) : factory_name{std::move(factory_name)}, @@ -447,15 +471,28 @@ struct ARROW_ACERO_EXPORT Declaration { /// }); static Declaration Sequence(std::vector decls); + /// \brief add the declaration to an already created execution plan + /// \param plan the plan to add the node to + /// \param registry the registry to use to lookup the node factory + /// + /// This method will recursively call AddToPlan on all of the declaration's inputs. + /// This method is only for advanced use when the DeclarationToXyz methods are not + /// sufficient. + /// + /// \return the instantiated execution node Result AddToPlan(ExecPlan* plan, ExecFactoryRegistry* registry = default_exec_factory_registry()) const; // Validate a declaration bool IsValid(ExecFactoryRegistry* registry = default_exec_factory_registry()) const; + /// \brief the name of the factory to use when creating a node std::string factory_name; + /// \brief the declarations's inputs std::vector inputs; + /// \brief options to control the behavior of the node std::shared_ptr options; + /// \brief a label to give the node in the plan std::string label; }; @@ -489,7 +526,7 @@ struct ARROW_ACERO_EXPORT QueryOptions { /// otherwise. /// /// If explicitly set to true then plan execution will fail if there is no - /// meaningful ordering. This can be useful to valdiate a query that should + /// meaningful ordering. This can be useful to validate a query that should /// be emitting ordered results. /// /// If explicitly set to false then batches will be emit immediately even if there @@ -513,6 +550,13 @@ struct ARROW_ACERO_EXPORT QueryOptions { /// the `use_threads` option. 
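  ///
  /// A minimal sketch of running a declaration single-threaded (the "table_source" and
  /// "filter" factory names are the ones Acero registers; `table` and the column "x"
  /// are assumed to be provided by the caller):
  ///
  ///     arrow::Result<std::shared_ptr<arrow::Table>> RunFiltered(
  ///         std::shared_ptr<arrow::Table> table) {
  ///       using namespace arrow::acero;
  ///       namespace cp = arrow::compute;
  ///       Declaration plan = Declaration::Sequence(
  ///           {{"table_source", TableSourceNodeOptions{table}},
  ///            {"filter", FilterNodeOptions{
  ///                 cp::greater(cp::field_ref("x"), cp::literal(3))}}});
  ///       // use_threads=false keeps execution on the calling thread.
  ///       return DeclarationToTable(std::move(plan), /*use_threads=*/false);
  ///     }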
::arrow::internal::Executor* custom_cpu_executor = NULLPTR; + /// \brief custom executor to use for IO work + /// + /// Must be null or remain valid for the duration of the plan. If this is null then + /// the global io thread pool will be chosen whose behavior will be controlled by + /// the "ARROW_IO_THREADS" environment. + ::arrow::internal::Executor* custom_io_executor = NULLPTR; + /// \brief a memory pool to use for allocations /// /// Must remain valid for the duration of the plan. @@ -707,6 +751,10 @@ DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context); /// fills up. /// /// If a custom exec context is provided then the value of `use_threads` will be ignored. +/// +/// The returned RecordBatchReader can be closed early to cancel the computation of record +/// batches. In this case, only errors encountered by the computation may be reported. In +/// particular, no cancellation error may be reported. ARROW_ACERO_EXPORT Result> DeclarationToReader( Declaration declaration, bool use_threads = true, MemoryPool* memory_pool = default_memory_pool(), @@ -746,6 +794,8 @@ ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync( ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync(Declaration declaration, ExecContext exec_context); +/// @} + /// \brief Wrap an ExecBatch generator in a RecordBatchReader. /// /// The RecordBatchReader does not impose any ordering on emitted batches. @@ -765,7 +815,5 @@ Result>()>> MakeReaderGenerator( std::shared_ptr reader, arrow::internal::Executor* io_executor, int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart); -/// @} - } // namespace acero } // namespace arrow diff --git a/include/arrow/acero/groupby.h b/include/arrow/acero/groupby.h deleted file mode 100644 index c24990a..0000000 --- a/include/arrow/acero/groupby.h +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/acero/exec_plan.h" -#include "arrow/acero/options.h" -#include "arrow/acero/visibility.h" -#include "arrow/compute/api_aggregate.h" -#include "arrow/compute/exec.h" -#include "arrow/compute/kernel.h" -#include "arrow/datum.h" -#include "arrow/result.h" - -namespace arrow { -namespace acero { - -/// Convenience function to perform a group-by on a table -/// -/// The result will be calculated using an exec plan with an aggregate node. -/// -/// If there are no arguments/aggregates then the returned table will have one row -/// for each unique combination of keys -/// -/// Note: If there are many groups the output table may have multiple chunks. -/// -/// If there are no keys then the aggregates will be applied to the full table. -/// The output table in this scenario is guaranteed to have exactly 1 row. 
-/// -/// \return a table that will have one column for each aggregate, named after they -/// aggregate function, and one column for each key -ARROW_ACERO_EXPORT -Result> TableGroupBy( - std::shared_ptr table, std::vector aggregates, - std::vector keys, bool use_threads = false, - MemoryPool* memory_pool = default_memory_pool()); - -/// Convenience function to perform a group-by on a record batch -/// -/// \see GroupByTable -ARROW_ACERO_EXPORT -Result> BatchGroupBy( - std::shared_ptr record_batch, std::vector aggregates, - std::vector keys, bool use_threads = false, - MemoryPool* memory_pool = default_memory_pool()); - -} // namespace acero -} // namespace arrow diff --git a/include/arrow/acero/options.h b/include/arrow/acero/options.h index 635f8a1..bb94bda 100644 --- a/include/arrow/acero/options.h +++ b/include/arrow/acero/options.h @@ -52,53 +52,80 @@ class Executor; namespace acero { +/// \brief This must not be used in release-mode +struct DebugOptions; + using AsyncExecBatchGenerator = AsyncGenerator>; -/// \addtogroup execnode-options +/// \addtogroup acero-nodes /// @{ + +/// \brief A base class for all options objects +/// +/// The only time this is used directly is when a node has no configuration class ARROW_ACERO_EXPORT ExecNodeOptions { public: virtual ~ExecNodeOptions() = default; + + /// \brief This must not be used in release-mode + std::shared_ptr debug_opts; }; -/// \brief Adapt an AsyncGenerator as a source node +/// \brief A node representing a generic source of data for Acero +/// +/// The source node will start calling `generator` during StartProducing. An initial +/// task will be created that will call `generator`. It will not call `generator` +/// reentrantly. If the source can be read in parallel then those details should be +/// encapsulated within `generator`. /// -/// plan->exec_context()->executor() will be used to parallelize pushing to -/// outputs, if provided. +/// For each batch received a new task will be created to push that batch downstream. +/// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the +/// parent batch and call InputRecieved. Thus, if the `generator` yields a large +/// batch it may result in several calls to InputReceived. +/// +/// The SourceNode will, by default, assign an implicit ordering to outgoing batches. +/// This is valid as long as the generator generates batches in a deterministic fashion. +/// Currently, the only way to override this is to subclass the SourceNode. +/// +/// This node is not generally used directly but can serve as the basis for various +/// specialized nodes. class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions { public: + /// Create an instance from values SourceNodeOptions(std::shared_ptr output_schema, std::function>()> generator) : output_schema(std::move(output_schema)), generator(std::move(generator)) {} - static Result> FromTable(const Table& table, - arrow::internal::Executor*); - - static Result> FromRecordBatchReader( - std::shared_ptr reader, std::shared_ptr schema, - arrow::internal::Executor*); - + /// \brief the schema for batches that will be generated by this source std::shared_ptr output_schema; + /// \brief an asynchronous stream of batches ending with std::nullopt std::function>()> generator; }; -/// \brief An extended Source node which accepts a table +/// \brief a node that generates data from a table already loaded in memory +/// +/// The table source node will slice off chunks, defined by `max_batch_size` +/// for parallel processing. 
The source node extends source node and so these +/// chunks will be iteratively processed in small batches. \see SourceNode +/// for details. class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions { public: static constexpr int64_t kDefaultMaxBatchSize = 1 << 20; + + /// Create an instance from values TableSourceNodeOptions(std::shared_ptr
<Table> table, int64_t max_batch_size = kDefaultMaxBatchSize) : table(table), max_batch_size(max_batch_size) {} - // arrow table which acts as the data source + /// \brief a table which acts as the data source std::shared_ptr<Table>
table; - // Size of batches to emit from this node - // If the table is larger the node will emit multiple batches from the - // the table to be processed in parallel. + /// \brief size of batches to emit from this node + /// If the table is larger the node will emit multiple batches from the + /// the table to be processed in parallel. int64_t max_batch_size; }; -/// \brief Define a lazy resolved Arrow table. +/// \brief define a lazily resolved Arrow table. /// /// The table uniquely identified by the names can typically be resolved at the time when /// the plan is to be consumed. @@ -106,19 +133,27 @@ class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions { /// This node is for serialization purposes only and can never be executed. class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions { public: + /// Create an instance from values NamedTableNodeOptions(std::vector names, std::shared_ptr schema) : names(std::move(names)), schema(schema) {} + /// \brief the names to put in the serialized plan std::vector names; + /// \brief the output schema of the table std::shared_ptr schema; }; -/// \brief An extended Source node which accepts a schema +/// \brief a source node which feeds data from a synchronous iterator of batches /// /// ItMaker is a maker of an iterator of tabular data. +/// +/// The node can be configured to use an I/O executor. If set then each time the +/// iterator is polled a new I/O thread task will be created to do the polling. This +/// allows a blocking iterator to stay off the CPU thread pool. template class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { public: + /// Create an instance that will create a new task on io_executor for each iteration SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, arrow::internal::Executor* io_executor) : schema(schema), @@ -126,6 +161,8 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { io_executor(io_executor), requires_io(true) {} + /// Create an instance that will either iterate synchronously or use the default I/O + /// executor SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, bool requires_io = false) : schema(schema), @@ -150,8 +187,13 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { bool requires_io; }; +/// a source node that reads from a RecordBatchReader +/// +/// Each iteration of the RecordBatchReader will be run on a new thread task created +/// on the I/O thread pool. 
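///
/// A minimal sketch (assuming the "record_batch_reader_source" factory name under
/// which Acero registers this node, and a caller-provided `reader`):
///
///     std::shared_ptr<arrow::RecordBatchReader> reader = GetReader();  // hypothetical helper
///     arrow::acero::Declaration source{
///         "record_batch_reader_source",
///         arrow::acero::RecordBatchReaderSourceNodeOptions{std::move(reader)}};
///     auto result_table = arrow::acero::DeclarationToTable(std::move(source));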
class ARROW_ACERO_EXPORT RecordBatchReaderSourceNodeOptions : public ExecNodeOptions { public: + /// Create an instance from values RecordBatchReaderSourceNodeOptions(std::shared_ptr reader, arrow::internal::Executor* io_executor = NULLPTR) : reader(std::move(reader)), io_executor(io_executor) {} @@ -165,6 +207,7 @@ class ARROW_ACERO_EXPORT RecordBatchReaderSourceNodeOptions : public ExecNodeOpt arrow::internal::Executor* io_executor; }; +/// a source node that reads from an iterator of array vectors using ArrayVectorIteratorMaker = std::function>()>; /// \brief An extended Source node which accepts a schema and array-vectors class ARROW_ACERO_EXPORT ArrayVectorSourceNodeOptions @@ -172,6 +215,7 @@ class ARROW_ACERO_EXPORT ArrayVectorSourceNodeOptions using SchemaSourceNodeOptions::SchemaSourceNodeOptions; }; +/// a source node that reads from an iterator of ExecBatch using ExecBatchIteratorMaker = std::function>()>; /// \brief An extended Source node which accepts a schema and exec-batches class ARROW_ACERO_EXPORT ExecBatchSourceNodeOptions @@ -186,34 +230,45 @@ class ARROW_ACERO_EXPORT ExecBatchSourceNodeOptions }; using RecordBatchIteratorMaker = std::function>()>; -/// \brief An extended Source node which accepts a schema and record-batches +/// a source node that reads from an iterator of RecordBatch class ARROW_ACERO_EXPORT RecordBatchSourceNodeOptions : public SchemaSourceNodeOptions { using SchemaSourceNodeOptions::SchemaSourceNodeOptions; }; -/// \brief Make a node which excludes some rows from batches passed through it +/// \brief a node which excludes some rows from batches passed through it /// /// filter_expression will be evaluated against each batch which is pushed to /// this node. Any rows for which filter_expression does not evaluate to `true` will be /// excluded in the batch emitted by this node. +/// +/// This node will emit empty batches if all rows are excluded. This is done +/// to avoid gaps in the ordering. class ARROW_ACERO_EXPORT FilterNodeOptions : public ExecNodeOptions { public: + /// \brief create an instance from values explicit FilterNodeOptions(Expression filter_expression) : filter_expression(std::move(filter_expression)) {} + /// \brief the expression to filter batches + /// + /// The return type of this expression must be boolean Expression filter_expression; }; +/// \brief a node which selects a specified subset from the input class ARROW_ACERO_EXPORT FetchNodeOptions : public ExecNodeOptions { public: static constexpr std::string_view kName = "fetch"; + /// \brief create an instance from values FetchNodeOptions(int64_t offset, int64_t count) : offset(offset), count(count) {} + /// \brief the number of rows to skip int64_t offset; + /// \brief the number of rows to keep (not counting skipped rows) int64_t count; }; -/// \brief Make a node which executes expressions on input batches, producing batches +/// \brief a node which executes expressions on input batches, producing batches /// of the same length with new columns. /// /// Each expression will be evaluated against each batch which is pushed to @@ -222,21 +277,44 @@ class ARROW_ACERO_EXPORT FetchNodeOptions : public ExecNodeOptions { /// If names are not provided, the string representations of exprs will be used. 
class ARROW_ACERO_EXPORT ProjectNodeOptions : public ExecNodeOptions { public: + /// \brief create an instance from values explicit ProjectNodeOptions(std::vector expressions, std::vector names = {}) : expressions(std::move(expressions)), names(std::move(names)) {} + /// \brief the expressions to run on the batches + /// + /// The output will have one column for each expression. If you wish to keep any of + /// the columns from the input then you should create a simple field_ref expression + /// for that column. std::vector expressions; + /// \brief the names of the output columns + /// + /// If this is not specified then the result of calling ToString on the expression will + /// be used instead + /// + /// This list should either be empty or have the same length as `expressions` std::vector names; }; -/// \brief Make a node which aggregates input batches, optionally grouped by keys and -/// optionally segmented by segment-keys. Both keys and segment-keys determine the group. -/// However segment-keys are also used for determining grouping segments, which should be -/// large, and allow streaming a partial aggregation result after processing each segment. -/// One common use-case for segment-keys is ordered aggregation, in which the segment-key -/// attribute specifies a column with non-decreasing values or a lexicographically-ordered -/// set of such columns. +/// \brief a node which aggregates input batches and calculates summary statistics +/// +/// The node can summarize the entire input or it can group the input with grouping keys +/// and segment keys. +/// +/// By default, the aggregate node is a pipeline breaker. It must accumulate all input +/// before any output is produced. Segment keys are a performance optimization. If +/// you know your input is already partitioned by one or more columns then you can +/// specify these as segment keys. At each change in the segment keys the node will +/// emit values for all data seen so far. +/// +/// Segment keys are currently limited to single-threaded mode. +/// +/// Both keys and segment-keys determine the group. However segment-keys are also used +/// for determining grouping segments, which should be large, and allow streaming a +/// partial aggregation result after processing each segment. One common use-case for +/// segment-keys is ordered aggregation, in which the segment-key attribute specifies a +/// column with non-decreasing values or a lexicographically-ordered set of such columns. /// /// If the keys attribute is a non-empty vector, then each aggregate in `aggregates` is /// expected to be a HashAggregate function. If the keys attribute is an empty vector, @@ -246,8 +324,14 @@ class ARROW_ACERO_EXPORT ProjectNodeOptions : public ExecNodeOptions { /// described above, applies. /// /// The keys and segment_keys vectors must be disjoint. +/// +/// If no measures are provided then you will simply get the list of unique keys. +/// +/// This node outputs segment keys first, followed by regular keys, followed by one +/// column for each aggregate. 
class ARROW_ACERO_EXPORT AggregateNodeOptions : public ExecNodeOptions { public: + /// \brief create an instance from values explicit AggregateNodeOptions(std::vector aggregates, std::vector keys = {}, std::vector segment_keys = {}) @@ -255,7 +339,7 @@ class ARROW_ACERO_EXPORT AggregateNodeOptions : public ExecNodeOptions { keys(std::move(keys)), segment_keys(std::move(segment_keys)) {} - // aggregations which will be applied to the targetted fields + // aggregations which will be applied to the targeted fields std::vector aggregates; // keys by which aggregations will be grouped (optional) std::vector keys; @@ -263,13 +347,18 @@ class ARROW_ACERO_EXPORT AggregateNodeOptions : public ExecNodeOptions { std::vector segment_keys; }; +/// \brief a default value at which backpressure will be applied constexpr int32_t kDefaultBackpressureHighBytes = 1 << 30; // 1GiB -constexpr int32_t kDefaultBackpressureLowBytes = 1 << 28; // 256MiB +/// \brief a default value at which backpressure will be removed +constexpr int32_t kDefaultBackpressureLowBytes = 1 << 28; // 256MiB +/// \brief an interface that can be queried for backpressure statistics class ARROW_ACERO_EXPORT BackpressureMonitor { public: virtual ~BackpressureMonitor() = default; + /// \brief fetches the number of bytes currently queued up virtual uint64_t bytes_in_use() = 0; + /// \brief checks to see if backpressure is currently applied virtual bool is_paused() = 0; }; @@ -286,18 +375,25 @@ struct ARROW_ACERO_EXPORT BackpressureOptions { BackpressureOptions(uint64_t resume_if_below, uint64_t pause_if_above) : resume_if_below(resume_if_below), pause_if_above(pause_if_above) {} + /// \brief create an instance using default values for backpressure limits static BackpressureOptions DefaultBackpressure() { return BackpressureOptions(kDefaultBackpressureLowBytes, kDefaultBackpressureHighBytes); } + /// \brief helper method to determine if backpressure is disabled + /// \return true if pause_if_above is greater than zero, false otherwise bool should_apply_backpressure() const { return pause_if_above > 0; } + /// \brief the number of bytes at which the producer should resume producing uint64_t resume_if_below; + /// \brief the number of bytes at which the producer should pause producing + /// + /// If this is <= 0 then backpressure will be disabled uint64_t pause_if_above; }; -/// \brief Add a sink node which forwards to an AsyncGenerator +/// \brief a sink node which collects results in a queue /// /// Emitted batches will only be ordered if there is a meaningful ordering /// and sequence_output is not set to false. @@ -369,6 +465,7 @@ class ARROW_ACERO_EXPORT BackpressureControl { virtual void Resume() = 0; }; +/// \brief a sink node that consumes the data as part of the plan using callbacks class ARROW_ACERO_EXPORT SinkNodeConsumer { public: virtual ~SinkNodeConsumer() = default; @@ -420,11 +517,13 @@ class ARROW_ACERO_EXPORT ConsumingSinkNodeOptions : public ExecNodeOptions { /// fields. Then sorted batches will be forwarded to the generator in sorted order. 
class ARROW_ACERO_EXPORT OrderBySinkNodeOptions : public SinkNodeOptions { public: + /// \brief create an instance from values explicit OrderBySinkNodeOptions( SortOptions sort_options, std::function>()>* generator) : SinkNodeOptions(generator), sort_options(std::move(sort_options)) {} + /// \brief options describing which columns and direction to sort SortOptions sort_options; }; @@ -443,8 +542,6 @@ class ARROW_ACERO_EXPORT OrderByNodeOptions : public ExecNodeOptions { Ordering ordering; }; -/// @} - enum class JoinType { LEFT_SEMI, RIGHT_SEMI, @@ -460,14 +557,12 @@ std::string ToString(JoinType t); enum class JoinKeyCmp { EQ, IS }; -/// \addtogroup execnode-options -/// @{ - -/// \brief Make a node which implements join operation using hash join strategy. +/// \brief a node which implements a join operation using a hash table class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions { public: static constexpr const char* default_output_suffix_for_left = ""; static constexpr const char* default_output_suffix_for_right = ""; + /// \brief create an instance from values that outputs all columns HashJoinNodeOptions( JoinType in_join_type, std::vector in_left_keys, std::vector in_right_keys, Expression filter = literal(true), @@ -487,6 +582,13 @@ class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions { this->key_cmp[i] = JoinKeyCmp::EQ; } } + /// \brief create an instance from keys + /// + /// This will create an inner join that outputs all columns and has no post join filter + /// + /// `in_left_keys` should have the same length and types as `in_right_keys` + /// @param in_left_keys the keys in the left input + /// @param in_right_keys the keys in the right input HashJoinNodeOptions(std::vector in_left_keys, std::vector in_right_keys) : left_keys(std::move(in_left_keys)), right_keys(std::move(in_right_keys)) { @@ -500,6 +602,7 @@ class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions { } this->filter = literal(true); } + /// \brief create an instance from values using JoinKeyCmp::EQ for all comparisons HashJoinNodeOptions( JoinType join_type, std::vector left_keys, std::vector right_keys, std::vector left_output, @@ -522,6 +625,7 @@ class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions { this->key_cmp[i] = JoinKeyCmp::EQ; } } + /// \brief create an instance from values HashJoinNodeOptions( JoinType join_type, std::vector left_keys, std::vector right_keys, std::vector left_output, @@ -575,7 +679,7 @@ class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions { bool disable_bloom_filter = false; }; -/// \brief Make a node which implements asof join operation +/// \brief a node which implements the asof join operation /// /// Note, this API is experimental and will change in the future /// @@ -625,7 +729,7 @@ class ARROW_ACERO_EXPORT AsofJoinNodeOptions : public ExecNodeOptions { int64_t tolerance; }; -/// \brief Make a node which select top_k/bottom_k rows passed through it +/// \brief a node which select top_k/bottom_k rows passed through it /// /// All batches pushed to this node will be accumulated, then selected, by the given /// fields. Then sorted batches will be forwarded to the generator in sorted order. @@ -640,16 +744,18 @@ class ARROW_ACERO_EXPORT SelectKSinkNodeOptions : public SinkNodeOptions { SelectKOptions select_k_options; }; -/// \brief Adapt a Table as a sink node -/// -/// obtains the output of an execution plan to -/// a table pointer. 
+/// \brief a sink node which accumulates all output into a table class ARROW_ACERO_EXPORT TableSinkNodeOptions : public ExecNodeOptions { public: + /// \brief create an instance from values explicit TableSinkNodeOptions(std::shared_ptr
<Table>* output_table, std::optional<bool> sequence_output = std::nullopt) : output_table(output_table), sequence_output(sequence_output) {} + /// \brief an "out parameter" specifying the table that will be created + /// + /// Must not be null and remain valid for the entirety of the plan execution. After the + /// plan has completed this will be set to point to the result table std::shared_ptr<Table>
* output_table; /// \brief Controls whether batches should be emitted immediately or sequenced in order /// @@ -664,6 +770,7 @@ class ARROW_ACERO_EXPORT TableSinkNodeOptions : public ExecNodeOptions { std::vector names; }; +/// \brief a row template that describes one row that will be generated for each input row struct ARROW_ACERO_EXPORT PivotLongerRowTemplate { PivotLongerRowTemplate(std::vector feature_values, std::vector> measurement_values) @@ -699,6 +806,7 @@ struct ARROW_ACERO_EXPORT PivotLongerRowTemplate { /// "location" (left vs right) and a measurement "temp". What we really want is: /// /// | time | location | temp | +/// | --- | --- | --- | /// | 1 | left | 10 | /// | 1 | right | 20 | /// | 2 | left | 15 | diff --git a/include/arrow/acero/test_nodes.h b/include/arrow/acero/test_nodes.h index 2d1d630..7e31aa3 100644 --- a/include/arrow/acero/test_nodes.h +++ b/include/arrow/acero/test_nodes.h @@ -53,6 +53,33 @@ struct JitterNodeOptions : public ExecNodeOptions { static constexpr std::string_view kName = "jitter"; }; +class GateImpl; + +class Gate { + public: + static std::shared_ptr Make(); + + Gate(); + virtual ~Gate(); + + void ReleaseAllBatches(); + void ReleaseOneBatch(); + Future<> WaitForNextReleasedBatch(); + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Gate); + + GateImpl* impl_; +}; + +// A node that holds all input batches until a given gate is released +struct GatedNodeOptions : public ExecNodeOptions { + explicit GatedNodeOptions(Gate* gate) : gate(gate) {} + Gate* gate; + + static constexpr std::string_view kName = "gated"; +}; + void RegisterTestNodes(); } // namespace acero diff --git a/include/arrow/api.h b/include/arrow/api.h index 5466af8..ac568a0 100644 --- a/include/arrow/api.h +++ b/include/arrow/api.h @@ -19,29 +19,29 @@ #pragma once -#include "arrow/array.h" // IYWU pragma: export -#include "arrow/array/array_run_end.h" // IYWU pragma: export -#include "arrow/array/concatenate.h" // IYWU pragma: export -#include "arrow/buffer.h" // IYWU pragma: export -#include "arrow/builder.h" // IYWU pragma: export -#include "arrow/chunked_array.h" // IYWU pragma: export -#include "arrow/compare.h" // IYWU pragma: export -#include "arrow/config.h" // IYWU pragma: export -#include "arrow/datum.h" // IYWU pragma: export -#include "arrow/extension_type.h" // IYWU pragma: export -#include "arrow/memory_pool.h" // IYWU pragma: export -#include "arrow/pretty_print.h" // IYWU pragma: export -#include "arrow/record_batch.h" // IYWU pragma: export -#include "arrow/result.h" // IYWU pragma: export -#include "arrow/status.h" // IYWU pragma: export -#include "arrow/table.h" // IYWU pragma: export -#include "arrow/table_builder.h" // IYWU pragma: export -#include "arrow/tensor.h" // IYWU pragma: export -#include "arrow/type.h" // IYWU pragma: export +#include "arrow/array.h" // IWYU pragma: export +#include "arrow/array/array_run_end.h" // IWYU pragma: export +#include "arrow/array/concatenate.h" // IWYU pragma: export +#include "arrow/buffer.h" // IWYU pragma: export +#include "arrow/builder.h" // IWYU pragma: export +#include "arrow/chunked_array.h" // IWYU pragma: export +#include "arrow/compare.h" // IWYU pragma: export +#include "arrow/config.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export +#include "arrow/extension_type.h" // IWYU pragma: export +#include "arrow/memory_pool.h" // IWYU pragma: export +#include "arrow/pretty_print.h" // IWYU pragma: export +#include "arrow/record_batch.h" // IWYU pragma: export +#include "arrow/result.h" // IWYU 
pragma: export +#include "arrow/status.h" // IWYU pragma: export +#include "arrow/table.h" // IWYU pragma: export +#include "arrow/table_builder.h" // IWYU pragma: export +#include "arrow/tensor.h" // IWYU pragma: export +#include "arrow/type.h" // IWYU pragma: export #include "arrow/util/key_value_metadata.h" // IWYU pragma: export -#include "arrow/visit_array_inline.h" // IYWU pragma: export -#include "arrow/visit_scalar_inline.h" // IYWU pragma: export -#include "arrow/visitor.h" // IYWU pragma: export +#include "arrow/visit_array_inline.h" // IWYU pragma: export +#include "arrow/visit_scalar_inline.h" // IWYU pragma: export +#include "arrow/visitor.h" // IWYU pragma: export /// \brief Top-level namespace for Apache Arrow C++ API namespace arrow {} diff --git a/include/arrow/array/array_nested.h b/include/arrow/array/array_nested.h index 6fb3fd3..4f5f3f6 100644 --- a/include/arrow/array/array_nested.h +++ b/include/arrow/array/array_nested.h @@ -234,6 +234,10 @@ class ARROW_EXPORT MapArray : public ListArray { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); + MapArray(const std::shared_ptr& type, int64_t length, BufferVector buffers, + const std::shared_ptr& keys, const std::shared_ptr& items, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& values, diff --git a/include/arrow/array/builder_binary.h b/include/arrow/array/builder_binary.h index c9020f3..b0c4fe2 100644 --- a/include/arrow/array/builder_binary.h +++ b/include/arrow/array/builder_binary.h @@ -67,7 +67,7 @@ class BaseBinaryBuilder Status Append(const uint8_t* value, offset_type length) { ARROW_RETURN_NOT_OK(Reserve(1)); - ARROW_RETURN_NOT_OK(AppendNextOffset()); + UnsafeAppendNextOffset(); // Safety check for UBSAN. 
if (ARROW_PREDICT_TRUE(length > 0)) { ARROW_RETURN_NOT_OK(ValidateOverflow(length)); @@ -114,15 +114,15 @@ class BaseBinaryBuilder } Status AppendNull() final { - ARROW_RETURN_NOT_OK(AppendNextOffset()); ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); UnsafeAppendToBitmap(false); return Status::OK(); } Status AppendEmptyValue() final { - ARROW_RETURN_NOT_OK(AppendNextOffset()); ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); UnsafeAppendToBitmap(true); return Status::OK(); } @@ -193,8 +193,7 @@ class BaseBinaryBuilder values.begin(), values.end(), 0ULL, [](uint64_t sum, const std::string& str) { return sum + str.size(); }); ARROW_RETURN_NOT_OK(Reserve(values.size())); - ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); - ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); if (valid_bytes != NULLPTR) { for (std::size_t i = 0; i < values.size(); ++i) { @@ -288,13 +287,16 @@ class BaseBinaryBuilder auto bitmap = array.GetValues(0, 0); auto offsets = array.GetValues(1); auto data = array.GetValues(2, 0); + auto total_length = offsets[offset + length] - offsets[offset]; + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); for (int64_t i = 0; i < length; i++) { if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) { const offset_type start = offsets[offset + i]; const offset_type end = offsets[offset + i + 1]; - ARROW_RETURN_NOT_OK(Append(data + start, end - start)); + UnsafeAppend(data + start, end - start); } else { - ARROW_RETURN_NOT_OK(AppendNull()); + UnsafeAppendNull(); } } return Status::OK(); diff --git a/include/arrow/array/builder_nested.h b/include/arrow/array/builder_nested.h index 74f7c04..d0b17c2 100644 --- a/include/arrow/array/builder_nested.h +++ b/include/arrow/array/builder_nested.h @@ -63,7 +63,7 @@ class BaseListBuilder : public ArrayBuilder { : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {} Status Resize(int64_t capacity) override { - if (capacity > maximum_elements()) { + if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { return Status::CapacityError("List array cannot reserve space for more than ", maximum_elements(), " got ", capacity); } @@ -99,14 +99,14 @@ class BaseListBuilder : public ArrayBuilder { Status Append(bool is_valid = true) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(is_valid); - return AppendNextOffset(); + UnsafeAppendNextOffset(); + return Status::OK(); } Status AppendNull() final { return Append(false); } Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); UnsafeAppendToBitmap(length, false); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { @@ -119,7 +119,6 @@ class BaseListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); UnsafeAppendToBitmap(length, true); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { @@ -133,17 +132,17 @@ class BaseListBuilder : public ArrayBuilder { const offset_type* offsets = array.GetValues(1); const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? 
array.buffers[0].data : NULLPTR; + ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { const bool is_valid = all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || array.IsValid(row); + UnsafeAppendToBitmap(is_valid); + UnsafeAppendNextOffset(); if (is_valid) { - ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], offsets[row], slot_length)); - } else { - ARROW_RETURN_NOT_OK(AppendNull()); } } return Status::OK(); @@ -202,6 +201,11 @@ class BaseListBuilder : public ArrayBuilder { const int64_t num_values = value_builder_->length(); return offsets_builder_.Append(static_cast(num_values)); } + + void UnsafeAppendNextOffset() { + const int64_t num_values = value_builder_->length(); + offsets_builder_.UnsafeAppend(static_cast(num_values)); + } }; /// \class ListBuilder diff --git a/include/arrow/array/builder_run_end.h b/include/arrow/array/builder_run_end.h index 9764c57..ac92efb 100644 --- a/include/arrow/array/builder_run_end.h +++ b/include/arrow/array/builder_run_end.h @@ -273,7 +273,7 @@ class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder { // Pre-condition: !value_run_builder_.has_open_run() template - Status DoAppendArray(const ArraySpan& to_append); + Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length); template Status DoAppendRunEnd(int64_t run_end); diff --git a/include/arrow/array/data.h b/include/arrow/array/data.h index 27a30ad..82a6e73 100644 --- a/include/arrow/array/data.h +++ b/include/arrow/array/data.h @@ -527,7 +527,8 @@ struct ARROW_EXPORT ArraySpan { int64_t ComputeLogicalNullCount() const; private: - friend bool internal::IsNullRunEndEncoded(const ArrayData& span, int64_t i); + ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& span, + int64_t i); bool IsNullSparseUnion(int64_t i) const; bool IsNullDenseUnion(int64_t i) const; diff --git a/include/arrow/array/util.h b/include/arrow/array/util.h index 6e6c61b..9f34af0 100644 --- a/include/arrow/array/util.h +++ b/include/arrow/array/util.h @@ -73,10 +73,11 @@ namespace internal { /// are not swapped by this function and should be handled separately. /// /// \param[in] data the array contents +/// \param[in] pool the memory pool to allocate memory from /// \return the resulting ArrayData whose elements were swapped ARROW_EXPORT Result> SwapEndianArrayData( - const std::shared_ptr& data); + const std::shared_ptr& data, MemoryPool* pool = default_memory_pool()); /// Given a number of ArrayVectors, treat each ArrayVector as the /// chunks of a chunked array. Then rechunk each ArrayVector such that diff --git a/include/arrow/buffer.h b/include/arrow/buffer.h index 9270c4d..65f1abd 100644 --- a/include/arrow/buffer.h +++ b/include/arrow/buffer.h @@ -63,7 +63,11 @@ class ARROW_EXPORT Buffer { Buffer(const uint8_t* data, int64_t size, std::shared_ptr mm, std::shared_ptr parent = NULLPTR) - : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) { + : is_mutable_(false), + data_(data), + size_(size), + capacity_(size), + parent_(std::move(parent)) { SetMemoryManager(std::move(mm)); } diff --git a/include/arrow/c/abi.h b/include/arrow/c/abi.h index d58417e..6abe866 100644 --- a/include/arrow/c/abi.h +++ b/include/arrow/c/abi.h @@ -15,10 +15,27 @@ // specific language governing permissions and limitations // under the License. 
+/// \file abi.h Arrow C Data Interface +/// +/// The Arrow C Data interface defines a very small, stable set +/// of C definitions which can be easily copied into any project's +/// source code and vendored to be used for columnar data interchange +/// in the Arrow format. For non-C/C++ languages and runtimes, +/// it should be almost as easy to translate the C definitions into +/// the corresponding C FFI declarations. +/// +/// Applications and libraries can therefore work with Arrow memory +/// without necessarily using the Arrow libraries or reinventing +/// the wheel. Developers can choose between tight integration +/// with the Arrow software project or minimal integration with +/// the Arrow format only. + #pragma once #include +// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html + #ifdef __cplusplus extern "C" { #endif @@ -65,6 +82,61 @@ struct ArrowArray { #endif // ARROW_C_DATA_INTERFACE +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +#define ARROW_C_DEVICE_DATA_INTERFACE + +// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html + +// DeviceType for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +#define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +#define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +#define ARROW_DEVICE_CUDA_HOST 3 +// OpenCL Device +#define ARROW_DEVICE_OPENCL 4 +// Vulkan buffer for next-gen graphics +#define ARROW_DEVICE_VULKAN 7 +// Metal for Apple GPU +#define ARROW_DEVICE_METAL 8 +// Verilog simulator buffer +#define ARROW_DEVICE_VPI 9 +// ROCm GPUs for AMD GPUs +#define ARROW_DEVICE_ROCM 10 +// Pinned ROCm CPU memory allocated by hipMallocHost +#define ARROW_DEVICE_ROCM_HOST 11 +// Reserved for extension +#define ARROW_DEVICE_EXT_DEV 12 +// CUDA managed/unified memory allocated by cudaMallocManaged +#define ARROW_DEVICE_CUDA_MANAGED 13 +// unified shared memory allocated on a oneAPI non-partitioned device. +#define ARROW_DEVICE_ONEAPI 14 +// GPU support for next-gen WebGPU standard +#define ARROW_DEVICE_WEBGPU 15 +// Qualcomm Hexagon DSP +#define ARROW_DEVICE_HEXAGON 16 + +struct ArrowDeviceArray { + // the Allocated Array + // + // the buffers in the array (along with the buffers of any + // children) are what is allocated on the device. + struct ArrowArray array; + // The device id to identify a specific device + int64_t device_id; + // The type of device which can access this memory. + ArrowDeviceType device_type; + // An event-like object to synchronize on if needed. + void* sync_event; + // Reserved bytes for future expansion. + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE + #ifndef ARROW_C_STREAM_INTERFACE #define ARROW_C_STREAM_INTERFACE @@ -106,6 +178,56 @@ struct ArrowArrayStream { #endif // ARROW_C_STREAM_INTERFACE +#ifndef ARROW_C_DEVICE_STREAM_INTERFACE +#define ARROW_C_DEVICE_STREAM_INTERFACE + +// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays. +// +// This stream is intended to provide a stream of data on a single +// device, if a producer wants data to be produced on multiple devices +// then multiple streams should be provided. One per device. +struct ArrowDeviceArrayStream { + // The device that this stream produces data on. + ArrowDeviceType device_type; + + // Callback to get the stream schema + // (will be the same for all arrays in the stream). + // + // Return value 0 if successful, an `errno`-compatible error code otherwise. 
+ // + // If successful, the ArrowSchema must be released independently from the stream. + // The schema should be accessible via CPU memory. + int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowDeviceArray must be released independently from the stream. + int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowDeviceArrayStream* self); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowDeviceArrayStream* self); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DEVICE_STREAM_INTERFACE + #ifdef __cplusplus } #endif diff --git a/include/arrow/c/helpers.h b/include/arrow/c/helpers.h index a5c1f6f..a24f272 100644 --- a/include/arrow/c/helpers.h +++ b/include/arrow/c/helpers.h @@ -17,11 +17,20 @@ #pragma once -#include +#include +#include #include #include "arrow/c/abi.h" +#define ARROW_C_ASSERT(condition, msg) \ + do { \ + if (!(condition)) { \ + fprintf(stderr, "%s:%d:: %s", __FILE__, __LINE__, (msg)); \ + abort(); \ + } \ + } while (0) + #ifdef __cplusplus extern "C" { #endif @@ -51,7 +60,8 @@ inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) { inline void ArrowSchemaRelease(struct ArrowSchema* schema) { if (!ArrowSchemaIsReleased(schema)) { schema->release(schema); - assert(ArrowSchemaIsReleased(schema)); + ARROW_C_ASSERT(ArrowSchemaIsReleased(schema), + "ArrowSchemaRelease did not cleanup release callback"); } } @@ -78,7 +88,8 @@ inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) { inline void ArrowArrayRelease(struct ArrowArray* array) { if (!ArrowArrayIsReleased(array)) { array->release(array); - assert(ArrowArrayIsReleased(array)); + ARROW_C_ASSERT(ArrowArrayIsReleased(array), + "ArrowArrayRelease did not cleanup release callback"); } } @@ -108,7 +119,8 @@ inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) { if (!ArrowArrayStreamIsReleased(stream)) { stream->release(stream); - assert(ArrowArrayStreamIsReleased(stream)); + ARROW_C_ASSERT(ArrowArrayStreamIsReleased(stream), + "ArrowArrayStreamRelease did not cleanup release callback"); } } diff --git a/include/arrow/compute/api.h b/include/arrow/compute/api.h index 7e7ec36..5b5dfdf 100644 --- a/include/arrow/compute/api.h +++ b/include/arrow/compute/api.h @@ -33,11 +33,6 @@ #include "arrow/compute/registry.h" // IWYU pragma: export #include "arrow/datum.h" // IWYU pragma: export -/// \defgroup execnode-expressions Utilities for creating expressions to -/// use in execution plans -/// @{ -/// @} - #include "arrow/compute/expression.h" // IWYU pragma: export /// \defgroup execnode-row Utilities for working with data in a row-major format @@ 
-46,7 +41,7 @@ #include "arrow/compute/row/grouper.h" // IWYU pragma: export -/// \defgroup execnode-components Components associated with ExecBatch +/// \defgroup acero-internals Acero internals, useful for those extending Acero /// @{ /// @} diff --git a/include/arrow/compute/api_aggregate.h b/include/arrow/compute/api_aggregate.h index 97c6542..8f45f61 100644 --- a/include/arrow/compute/api_aggregate.h +++ b/include/arrow/compute/api_aggregate.h @@ -284,6 +284,36 @@ Result Sum( const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), ExecContext* ctx = NULLPTR); +/// \brief Calculate the first value of an array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed first as Scalar +/// +/// \since 13.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result First( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the last value of an array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed last as a Scalar +/// +/// \since 13.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Last( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + /// \brief Calculate the min / max of a numeric array /// /// This function returns both the min and max as a struct scalar, with type diff --git a/include/arrow/compute/api_vector.h b/include/arrow/compute/api_vector.h index 2ec1cf9..c85db1a 100644 --- a/include/arrow/compute/api_vector.h +++ b/include/arrow/compute/api_vector.h @@ -210,25 +210,39 @@ class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { NullPlacement null_placement; }; -/// \brief Options for cumulative sum function -class ARROW_EXPORT CumulativeSumOptions : public FunctionOptions { +/// \brief Options for cumulative functions +/// \note Also aliased as CumulativeSumOptions for backward compatibility +class ARROW_EXPORT CumulativeOptions : public FunctionOptions { public: - explicit CumulativeSumOptions(double start = 0, bool skip_nulls = false, - bool check_overflow = false); - explicit CumulativeSumOptions(std::shared_ptr start, bool skip_nulls = false, - bool check_overflow = false); - static constexpr char const kTypeName[] = "CumulativeSumOptions"; - static CumulativeSumOptions Defaults() { return CumulativeSumOptions(); } - - /// Optional starting value for cumulative operation computation - std::shared_ptr start; + explicit CumulativeOptions(bool skip_nulls = false); + explicit CumulativeOptions(double start, bool skip_nulls = false); + explicit CumulativeOptions(std::shared_ptr start, bool skip_nulls = false); + static constexpr char const kTypeName[] = "CumulativeOptions"; + static CumulativeOptions Defaults() { return CumulativeOptions(); } + + /// Optional starting value for cumulative operation computation, default depends on the + /// operation and input type. + /// - sum: 0 + /// - prod: 1 + /// - min: maximum of the input type + /// - max: minimum of the input type + std::optional> start; /// If true, nulls in the input are ignored and produce a corresponding null output. 
/// When false, the first null encountered is propagated through the remaining output. bool skip_nulls = false; +}; +using CumulativeSumOptions = CumulativeOptions; // For backward compatibility + +/// \brief Options for pairwise functions +class ARROW_EXPORT PairwiseOptions : public FunctionOptions { + public: + explicit PairwiseOptions(int64_t periods = 1); + static constexpr char const kTypeName[] = "PairwiseOptions"; + static PairwiseOptions Defaults() { return PairwiseOptions(); } - /// When true, returns an Invalid Status when overflow is detected - bool check_overflow = false; + /// Periods to shift for applying the binary operation, accepts negative values. + int64_t periods = 1; }; /// @} @@ -259,12 +273,18 @@ namespace internal { // These internal functions are implemented in kernels/vector_selection.cc /// \brief Return the number of selected indices in the boolean filter +/// +/// \param filter a plain or run-end encoded boolean array with or without nulls +/// \param null_selection how to handle nulls in the filter ARROW_EXPORT int64_t GetFilterOutputSize(const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection); /// \brief Compute uint64 selection indices for use with Take given a boolean /// filter +/// +/// \param filter a plain or run-end encoded boolean array with or without nulls +/// \param null_selection how to handle nulls in the filter ARROW_EXPORT Result> GetTakeIndices( const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, @@ -597,12 +617,72 @@ Result RunEndEncode( ARROW_EXPORT Result RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR); +/// \brief Compute the cumulative sum of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative sum behavior +/// \param[in] check_overflow whether to check for overflow, if true, return Invalid +/// status on overflow, otherwise wrap around on overflow +/// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeSum( - const Datum& values, - const CumulativeSumOptions& options = CumulativeSumOptions::Defaults(), + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + bool check_overflow = false, ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative product of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative prod behavior +/// \param[in] check_overflow whether to check for overflow, if true, return Invalid +/// status on overflow, otherwise wrap around on overflow +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeProd( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + bool check_overflow = false, ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative max of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative max behavior +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMax( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative min of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative min behavior +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMin( + const Datum& values, const 
 
+/// \brief Return the first order difference of an array.
+///
+/// Computes the first order difference of an array, i.e.
+///   output[i] = input[i] - input[i - p]  if i >= p
+///   output[i] = null                     otherwise
+/// where p is the period. For example, with p = 1,
+///   Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5].
+/// With p = 2,
+///   Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6]
+/// p can also be negative, in which case the diff is computed in
+/// the opposite direction.
+/// \param[in] array array input
+/// \param[in] options options, specifying overflow behavior and period
+/// \param[in] check_overflow whether to return error on overflow
+/// \param[in] ctx the function execution context, optional
+/// \return result as array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> PairwiseDiff(const Array& array,
+                                            const PairwiseOptions& options,
+                                            bool check_overflow = false,
+                                            ExecContext* ctx = NULLPTR);
+
 // ----------------------------------------------------------------------
 // Deprecated functions
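A short sketch of PairwiseDiff matching the p = 2 example in the doc comment above (illustrative; the builder values are taken from that example):

    #include <iostream>
    #include <memory>

    #include "arrow/api.h"
    #include "arrow/compute/api.h"

    // Sketch: first order difference with a period of 2.
    arrow::Status Diff2() {
      arrow::Int64Builder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({1, 4, 9, 10, 15}));
      std::shared_ptr<arrow::Array> values;
      ARROW_RETURN_NOT_OK(builder.Finish(&values));

      arrow::compute::PairwiseOptions options(/*periods=*/2);
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> diff,
                            arrow::compute::PairwiseDiff(*values, options));
      std::cout << diff->ToString() << std::endl;  // expect [null, null, 8, 6, 6]
      return arrow::Status::OK();
    }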
diff --git a/include/arrow/compute/exec.h b/include/arrow/compute/exec.h
index d583f0e..3fbefe4 100644
--- a/include/arrow/compute/exec.h
+++ b/include/arrow/compute/exec.h
@@ -168,7 +168,7 @@ constexpr int64_t kUnsequencedIndex = -1;
 /// than is desirable for this class. Microbenchmarks would help determine for
 /// sure. See ARROW-8928.
 
-/// \addtogroup execnode-components
+/// \addtogroup acero-internals
 /// @{
 
 struct ARROW_EXPORT ExecBatch {
@@ -265,6 +265,13 @@ inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equal
 
 ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*);
 
+/// @}
+
+/// \defgroup compute-internals Utilities for calling functions, useful for those
+/// extending the function registry
+///
+/// @{
+
 struct ExecValue {
   ArraySpan array = {};
   const Scalar* scalar = NULLPTR;
@@ -349,6 +356,9 @@ struct ARROW_EXPORT ExecResult {
   const std::shared_ptr<ArrayData>& array_data() const {
     return std::get<std::shared_ptr<ArrayData>>(this->value);
   }
+  ArrayData* array_data_mutable() {
+    return std::get<std::shared_ptr<ArrayData>>(this->value).get();
+  }
   bool is_array_data() const { return this->value.index() == 1; }
 };
@@ -414,8 +424,6 @@ struct ARROW_EXPORT ExecSpan {
   std::vector<ExecValue> values;
 };
 
-/// @}
-
 /// \defgroup compute-call-function One-shot calls to compute functions
 ///
 /// @{
diff --git a/include/arrow/compute/expression.h b/include/arrow/compute/expression.h
index c9c7b0e..9a36a6d 100644
--- a/include/arrow/compute/expression.h
+++ b/include/arrow/compute/expression.h
@@ -33,7 +33,7 @@ namespace arrow {
 namespace compute {
 
-/// \defgroup expression-core Expressions to describe transformations in execution plans
+/// \defgroup expression-core Expressions to describe data transformations
 ///
 /// @{
 
@@ -94,7 +94,11 @@ class ARROW_EXPORT Expression {
   bool IsNullLiteral() const;
 
   /// Return true if this expression could evaluate to true. Will return true for any
-  /// unbound, non-boolean, or unsimplified Expressions
+  /// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any
+  /// canonicalization or simplification of the expression, so even Expressions
+  /// which are unsatisfiable may spuriously return `true` here. This function is
+  /// intended for use in predicate pushdown where a filter expression is simplified
+  /// by a guarantee, so it assumes that trying to simplify again would be redundant.
   bool IsSatisfiable() const;
 
   // XXX someday
@@ -256,7 +260,7 @@ Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
 ARROW_EXPORT
 Result<Expression> Deserialize(std::shared_ptr<Buffer>);
 
-/// \defgroup expression-convenience Functions convenient expression creation
+/// \defgroup expression-convenience Helpers for convenient expression creation
 ///
 /// @{
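The predicate-pushdown workflow described in the IsSatisfiable comment above can be sketched as follows. This is illustrative only: the schema, field name and literal values are invented, while Bind and SimplifyWithGuarantee are the existing helpers from expression.h:

    #include <iostream>

    #include "arrow/api.h"
    #include "arrow/compute/api.h"
    #include "arrow/compute/expression.h"

    // Sketch: simplify a filter under a partition guarantee, then test satisfiability.
    arrow::Status CheckFilter() {
      namespace cp = arrow::compute;
      auto schema = arrow::schema({arrow::field("x", arrow::int32())});

      // Filter requested by the user, and a guarantee known from partitioning.
      cp::Expression filter = cp::equal(cp::field_ref("x"), cp::literal(7));
      cp::Expression guarantee = cp::equal(cp::field_ref("x"), cp::literal(3));

      ARROW_ASSIGN_OR_RAISE(cp::Expression bound, filter.Bind(*schema));
      ARROW_ASSIGN_OR_RAISE(cp::Expression simplified,
                            cp::SimplifyWithGuarantee(bound, guarantee));

      // The simplified filter folds to a false literal, so the fragment can be skipped.
      std::cout << simplified.ToString() << " satisfiable: " << std::boolalpha
                << simplified.IsSatisfiable() << std::endl;
      return arrow::Status::OK();
    }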
diff --git a/include/arrow/compute/kernel.h b/include/arrow/compute/kernel.h
index a642130..5b5b571 100644
--- a/include/arrow/compute/kernel.h
+++ b/include/arrow/compute/kernel.h
@@ -152,6 +152,12 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
 ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
     std::shared_ptr<TypeMatcher> value_type_matcher);
 
+/// \brief Match run-end encoded types that use any valid run-end type and
+/// encode specific value types
+///
+/// @param[in] value_type_id a type id that the type of the values field should match
+ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);
+
 /// \brief Match run-end encoded types that encode specific run-end and value types
 ///
 /// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
@@ -277,14 +283,16 @@ class ARROW_EXPORT OutputType {
   ///
   /// This function SHOULD _not_ be used to check for arity, that is to be
   /// performed one or more layers above.
-  using Resolver = Result<TypeHolder> (*)(KernelContext*, const std::vector<TypeHolder>&);
+  using Resolver =
+      std::function<Result<TypeHolder>(KernelContext*, const std::vector<TypeHolder>&)>;
 
   /// \brief Output an exact type
   OutputType(std::shared_ptr<DataType> type)  // NOLINT implicit construction
       : kind_(FIXED), type_(std::move(type)) {}
 
   /// \brief Output a computed type depending on actual input types
-  OutputType(Resolver resolver)  // NOLINT implicit construction
+  template <typename Fn>
+  OutputType(Fn resolver)  // NOLINT implicit construction
       : kind_(COMPUTED), resolver_(std::move(resolver)) {}
 
   OutputType(const OutputType& other) {
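Because Resolver is now a std::function rather than a bare function pointer, a computed output type can be written as a capturing lambda. A sketch under that assumption (TypeHolder is the existing helper from arrow/type.h; the fallback parameter is invented for illustration):

    #include <memory>
    #include <vector>

    #include "arrow/compute/kernel.h"
    #include "arrow/result.h"
    #include "arrow/type.h"

    // Sketch: resolve to the first input type, or to a captured fallback type.
    arrow::compute::OutputType MakeResolver(std::shared_ptr<arrow::DataType> fallback) {
      return arrow::compute::OutputType(
          [fallback](arrow::compute::KernelContext*,
                     const std::vector<arrow::TypeHolder>& in_types)
              -> arrow::Result<arrow::TypeHolder> {
            if (in_types.empty()) return arrow::TypeHolder(fallback);
            return in_types[0];
          });
    }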
@@ -644,22 +652,22 @@ using ScalarAggregateFinalize = Status (*)(KernelContext*, Datum*);
 /// * finalize: produces the end result of the aggregation using the
 ///   KernelState in the KernelContext.
 struct ARROW_EXPORT ScalarAggregateKernel : public Kernel {
-  ScalarAggregateKernel() = default;
-
   ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
                         ScalarAggregateConsume consume, ScalarAggregateMerge merge,
-                        ScalarAggregateFinalize finalize)
+                        ScalarAggregateFinalize finalize, const bool ordered)
       : Kernel(std::move(sig), std::move(init)),
         consume(consume),
         merge(merge),
-        finalize(finalize) {}
+        finalize(finalize),
+        ordered(ordered) {}
 
   ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
                         KernelInit init, ScalarAggregateConsume consume,
-                        ScalarAggregateMerge merge, ScalarAggregateFinalize finalize)
+                        ScalarAggregateMerge merge, ScalarAggregateFinalize finalize,
+                        const bool ordered)
       : ScalarAggregateKernel(
             KernelSignature::Make(std::move(in_types), std::move(out_type)),
-            std::move(init), consume, merge, finalize) {}
+            std::move(init), consume, merge, finalize, ordered) {}
 
   /// \brief Merge a vector of KernelStates into a single KernelState.
   /// The merged state will be returned and will be set on the KernelContext.
@@ -670,6 +678,14 @@ struct ARROW_EXPORT ScalarAggregateKernel : public Kernel {
   ScalarAggregateConsume consume;
   ScalarAggregateMerge merge;
   ScalarAggregateFinalize finalize;
+  /// \brief Whether this kernel requires ordering
+  /// Some aggregations, such as "first", require some kind of input order. The
+  /// order can be implicit, e.g., the order of the input data, or explicit, e.g.
+  /// the ordering specified with a window aggregation.
+  /// The caller of the aggregate kernel is responsible for passing data in some
+  /// defined order to the kernel. The flag here is a way for the kernel to tell
+  /// the caller that data passed to the kernel must be defined in some order.
+  bool ordered = false;
 };
 
 // ----------------------------------------------------------------------
@@ -699,25 +715,31 @@ struct ARROW_EXPORT HashAggregateKernel : public Kernel {
 
   HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
                       HashAggregateResize resize, HashAggregateConsume consume,
-                      HashAggregateMerge merge, HashAggregateFinalize finalize)
+                      HashAggregateMerge merge, HashAggregateFinalize finalize,
+                      const bool ordered)
       : Kernel(std::move(sig), std::move(init)),
         resize(resize),
         consume(consume),
         merge(merge),
-        finalize(finalize) {}
+        finalize(finalize),
+        ordered(ordered) {}
 
   HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
                       KernelInit init, HashAggregateConsume consume,
                       HashAggregateResize resize, HashAggregateMerge merge,
-                      HashAggregateFinalize finalize)
+                      HashAggregateFinalize finalize, const bool ordered)
       : HashAggregateKernel(
             KernelSignature::Make(std::move(in_types), std::move(out_type)),
-            std::move(init), resize, consume, merge, finalize) {}
+            std::move(init), resize, consume, merge, finalize, ordered) {}
 
   HashAggregateResize resize;
   HashAggregateConsume consume;
   HashAggregateMerge merge;
   HashAggregateFinalize finalize;
+  /// @brief whether the summarizer requires ordering
+  /// This is similar to ScalarAggregateKernel. See ScalarAggregateKernel
+  /// for detailed doc of this variable.
+  bool ordered = false;
 };
 
 }  // namespace compute
diff --git a/include/arrow/compute/key_map.h b/include/arrow/compute/key_map.h
index 5e40b3d..7ab4847 100644
--- a/include/arrow/compute/key_map.h
+++ b/include/arrow/compute/key_map.h
@@ -17,13 +17,13 @@
 
 #pragma once
 
+#include
 #include "arrow/compute/util.h"
-#include "arrow/compute/util_internal.h"
-#include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
+#include "arrow/type_fwd.h"
 
 namespace arrow {
 namespace compute {
@@ -80,9 +80,11 @@ class ARROW_EXPORT SwissTable {
 
   void num_inserted(uint32_t i) { num_inserted_ = i; }
 
-  uint8_t* blocks() const { return blocks_; }
+  uint8_t* blocks() const { return blocks_->mutable_data(); }
 
-  uint32_t* hashes() const { return hashes_; }
+  uint32_t* hashes() const {
+    return reinterpret_cast<uint32_t*>(hashes_->mutable_data());
+  }
 
   /// \brief Extract group id for a given slot in a given block.
   ///
@@ -226,12 +228,12 @@ class ARROW_EXPORT SwissTable {
   // ---------------------------------------------------
   // * Empty bucket has value 0x80. Non-empty bucket has highest bit set to 0.
   //
-  uint8_t* blocks_;
+  std::shared_ptr blocks_;
 
   // Array of hashes of values inserted into slots.
   // Undefined if the corresponding slot is empty.
   // There is 64B padding at the end.
-  uint32_t* hashes_;
+  std::shared_ptr hashes_;
 
   int64_t hardware_flags_;
   MemoryPool* pool_;
@@ -243,8 +245,8 @@ uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
   // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
   // that case we can extract group id using aligned 64-bit word access.
   int num_group_id_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
-  ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
-               num_group_id_bits == 32 || num_group_id_bits == 64);
+  assert(num_group_id_bits == 8 || num_group_id_bits == 16 || num_group_id_bits == 32 ||
+         num_group_id_bits == 64);
 
   int bit_offset = slot * num_group_id_bits;
   const uint64_t* group_id_bytes =
@@ -260,8 +262,8 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
   // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
   // In that case we can insert group id value using aligned 64-bit word access.
-  ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
-               num_groupid_bits == 32 || num_groupid_bits == 64);
+  assert(num_groupid_bits == 8 || num_groupid_bits == 16 || num_groupid_bits == 32 ||
+         num_groupid_bits == 64);
 
   const uint64_t num_block_bytes = (8 + num_groupid_bits);
   constexpr uint64_t stamp_mask = 0x7f;
@@ -270,13 +272,13 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
   int stamp =
       static_cast<int>((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask);
   uint64_t block_id = slot_id >> 3;
-  uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
+  uint8_t* blockbase = blocks_->mutable_data() + num_block_bytes * block_id;
   blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
   int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
 
   // Block status bytes should start at an address aligned to 8 bytes
-  ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
+  assert((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
   uint64_t* ptr = reinterpret_cast<uint64_t*>(blockbase) + 1 + (groupid_bit_offset >> 6);
   *ptr |= (static_cast<uint64_t>(group_id) << (groupid_bit_offset & 63));
 }
diff --git a/include/arrow/compute/registry.h b/include/arrow/compute/registry.h
index a7eb4bc..afd9f20 100644
--- a/include/arrow/compute/registry.h
+++ b/include/arrow/compute/registry.h
@@ -55,7 +55,7 @@ class ARROW_EXPORT FunctionRegistry {
   /// \brief Construct a new nested registry with the given parent.
   ///
   /// Most users only need to use the global registry. The returned registry never changes
-  /// its parent, even when an operation allows overwritting.
+  /// its parent, even when an operation allows overwriting.
   static std::unique_ptr<FunctionRegistry> Make(FunctionRegistry* parent);
 
   /// \brief Check whether a new function can be added to the registry.
diff --git a/include/arrow/compute/util.h b/include/arrow/compute/util.h
index 6e1bb79..489139e 100644
--- a/include/arrow/compute/util.h
+++ b/include/arrow/compute/util.h
@@ -139,69 +139,55 @@ class TempVectorHolder {
   uint32_t num_elements_;
 };
 
-class ARROW_EXPORT bit_util {
- public:
-  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
-                              const int num_bits, const uint8_t* bits, int* num_indexes,
-                              uint16_t* indexes, int bit_offset = 0);
+namespace bit_util {
 
-  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
                                   const int num_bits, const uint8_t* bits,
-                                  const uint16_t* input_indexes, int* num_indexes,
-                                  uint16_t* indexes, int bit_offset = 0);
+                                  int* num_indexes, uint16_t* indexes,
+                                  int bit_offset = 0);
 
-  // Input and output indexes may be pointing to the same data (in-place filtering).
-  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
-                                 const uint8_t* bits, int* num_indexes_bit0,
-                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
-                                 int bit_offset = 0);
+ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+                                      const int num_bits, const uint8_t* bits,
+                                      const uint16_t* input_indexes, int* num_indexes,
+                                      uint16_t* indexes, int bit_offset = 0);
 
-  // Bit 1 is replaced with byte 0xFF.
-  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
-                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
+// Input and output indexes may be pointing to the same data (in-place filtering).
+ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+                                     const uint8_t* bits, int* num_indexes_bit0,
+                                     uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+                                     int bit_offset = 0);
 
-  // Return highest bit of each byte.
-  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
-                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
+// Bit 1 is replaced with byte 0xFF.
+ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+                                const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
 
-  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
-                                 uint32_t num_bytes);
+// Return highest bit of each byte.
+ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+                                const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
 
- private:
-  inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes);
-  inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value);
-  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
-                                            int* num_indexes, uint16_t* indexes);
-  inline static void bits_filter_indexes_helper(uint64_t word,
-                                                const uint16_t* input_indexes,
-                                                int* num_indexes, uint16_t* indexes);
-  template
-  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
-                                       const uint8_t* bits, const uint16_t* input_indexes,
-                                       int* num_indexes, uint16_t* indexes,
-                                       uint16_t base_index = 0);
+ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+                                     uint32_t num_bytes);
 
 #if defined(ARROW_HAVE_AVX2)
-  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
-                                   const uint8_t* bits, int* num_indexes,
-                                   uint16_t* indexes, uint16_t base_index = 0);
-  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
-                                       const uint8_t* bits, const uint16_t* input_indexes,
-                                       int* num_indexes, uint16_t* indexes);
-  template
-  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-                                       int* num_indexes, uint16_t* indexes,
-                                       uint16_t base_index = 0);
-  template
-  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+
+namespace avx2 {
+ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+                                           const uint8_t* bits,
                                            const uint16_t* input_indexes,
                                            int* num_indexes, uint16_t* indexes);
-  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
-  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
-  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+                                       const uint8_t* bits, int* num_indexes,
+                                       uint16_t* indexes, uint16_t base_index = 0);
+ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
+                                     uint8_t* bytes);
+ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
+                                     uint8_t* bits);
+ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+}  // namespace avx2
+
 #endif
-};
+
+}  // namespace bit_util
 
 }  // namespace util
 
 namespace compute {
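With bit_util now a namespace of free functions, callers spell e.g. arrow::util::bit_util::bits_to_bytes directly. These are internal utilities, so treat the following as a rough sketch only; hardware_flags = 0 selects the portable (non-AVX2) path and the buffers are deliberately over-sized to stay clear of any word-sized access inside the implementation:

    #include <cstdint>
    #include <iostream>

    #include "arrow/compute/util.h"

    // Sketch: expand a small bitmap into one byte per bit (0xFF for set bits).
    void ExpandBits() {
      const uint8_t bits[8] = {0b00001101};  // bits 0, 2 and 3 set, LSB first
      uint8_t bytes[64] = {};                // generously padded output buffer

      arrow::util::bit_util::bits_to_bytes(/*hardware_flags=*/0, /*num_bits=*/8, bits,
                                           bytes);
      for (int i = 0; i < 8; ++i) {
        std::cout << (bytes[i] ? 1 : 0);
      }
      std::cout << std::endl;  // expected: 10110000, assuming LSB-first bit order
    }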
diff --git a/include/arrow/dataset/api.h b/include/arrow/dataset/api.h
index 6554dfc..c2ebd9d 100644
--- a/include/arrow/dataset/api.h
+++ b/include/arrow/dataset/api.h
@@ -26,6 +26,9 @@ #ifdef ARROW_CSV
 #include "arrow/dataset/file_csv.h"
 #endif
+#ifdef ARROW_JSON
+#include "arrow/dataset/file_json.h"
+#endif
 #include "arrow/dataset/file_ipc.h"
 #ifdef ARROW_ORC
 #include "arrow/dataset/file_orc.h"
diff --git a/include/arrow/dataset/partition.h b/include/arrow/dataset/partition.h
index b122047..315a3d3 100644
--- a/include/arrow/dataset/partition.h
+++ b/include/arrow/dataset/partition.h
@@ -187,6 +187,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
 
   const ArrayVector& dictionaries() const { return dictionaries_; }
 
+  SegmentEncoding segment_encoding() const { return options_.segment_encoding; }
+
   bool Equals(const Partitioning& other) const override;
 
  protected:
diff --git a/include/arrow/dataset/type_fwd.h b/include/arrow/dataset/type_fwd.h
index a7ea8d6..d58781e 100644
--- a/include/arrow/dataset/type_fwd.h
+++ b/include/arrow/dataset/type_fwd.h
@@ -72,6 +72,11 @@ class CsvFileWriter;
 class CsvFileWriteOptions;
 struct CsvFragmentScanOptions;
 
+class JsonFileFormat;
+class JsonFileWriter;
+class JsonFileWriteOptions;
+struct JsonFragmentScanOptions;
+
 class IpcFileFormat;
 class IpcFileWriter;
 class IpcFileWriteOptions;
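A sketch of scanning newline-delimited JSON files as a dataset, assuming the library was built with ARROW_JSON (see the #ifdef above); the /data/events path is hypothetical:

    #include <memory>

    #include "arrow/dataset/api.h"
    #include "arrow/filesystem/api.h"

    // Sketch: discover *.json files and read them into a single Table.
    arrow::Result<std::shared_ptr<arrow::Table>> ReadJsonDataset() {
      auto fs = std::make_shared<arrow::fs::LocalFileSystem>();

      arrow::fs::FileSelector selector;
      selector.base_dir = "/data/events";
      selector.recursive = true;

      auto format = std::make_shared<arrow::dataset::JsonFileFormat>();
      ARROW_ASSIGN_OR_RAISE(auto factory,
                            arrow::dataset::FileSystemDatasetFactory::Make(
                                fs, selector, format,
                                arrow::dataset::FileSystemFactoryOptions{}));
      ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish());
      ARROW_ASSIGN_OR_RAISE(auto scanner_builder, dataset->NewScan());
      ARROW_ASSIGN_OR_RAISE(auto scanner, scanner_builder->Finish());
      return scanner->ToTable();
    }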
diff --git a/include/arrow/datum.h b/include/arrow/datum.h
index 1d6d87a..57ae373 100644
--- a/include/arrow/datum.h
+++ b/include/arrow/datum.h
@@ -43,14 +43,19 @@ class Table;
 
 /// \class Datum
 /// \brief Variant type for various Arrow C++ data structures
 struct ARROW_EXPORT Datum {
+  /// \brief The kind of datum stored
   enum Kind { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE };
 
+  /// \brief A placeholder type to represent empty datum
   struct Empty {};
 
-  // Datums variants may have a length. This special value indicate that the
-  // current variant does not have a length.
+  /// \brief Datum variants may have a length. This special value indicates that the
+  /// current variant does not have a length.
   static constexpr int64_t kUnknownLength = -1;
 
+  /// \brief Storage of the actual datum.
+  ///
+  /// Note: For arrays, ArrayData is stored instead of Array for easier processing
   std::variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
               std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
               std::shared_ptr<Table>>
      value;
@@ -64,28 +69,49 @@ Datum(Datum&& other) = default;
   Datum& operator=(Datum&& other) = default;
 
+  /// \brief Construct from a Scalar
   Datum(std::shared_ptr<Scalar> value)  // NOLINT implicit conversion
       : value(std::move(value)) {}
 
+  /// \brief Construct from an ArrayData
   Datum(std::shared_ptr<ArrayData> value)  // NOLINT implicit conversion
       : value(std::move(value)) {}
 
+  /// \brief Construct from an ArrayData
   Datum(ArrayData arg)  // NOLINT implicit conversion
       : value(std::make_shared<ArrayData>(std::move(arg))) {}
 
-  Datum(const Array& value);  // NOLINT implicit conversion
+  /// \brief Construct from an Array
+  Datum(const Array& value);  // NOLINT implicit conversion
+
+  /// \brief Construct from an Array
   Datum(const std::shared_ptr<Array>& value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a ChunkedArray
   Datum(std::shared_ptr<ChunkedArray> value);  // NOLINT implicit conversion
-  Datum(std::shared_ptr<RecordBatch> value);  // NOLINT implicit conversion
-  Datum(std::shared_ptr<Table> value);  // NOLINT implicit conversion
 
-  // Explicit constructors from const-refs. Can be expensive, prefer the
-  // shared_ptr constructors
+  /// \brief Construct from a RecordBatch
+  Datum(std::shared_ptr<RecordBatch> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a Table
+  Datum(std::shared_ptr<Table> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a ChunkedArray.
+  ///
+  /// This can be expensive, prefer the shared_ptr<ChunkedArray> constructor
   explicit Datum(const ChunkedArray& value);
+
+  /// \brief Construct from a RecordBatch.
+  ///
+  /// This can be expensive, prefer the shared_ptr<RecordBatch> constructor
   explicit Datum(const RecordBatch& value);
+
+  /// \brief Construct from a Table.
+  ///
+  /// This can be expensive, prefer the shared_ptr<Table> constructor
   explicit Datum(const Table& value);
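A small sketch of the constructors documented in this header (illustrative; the array and table arguments are assumed to come from elsewhere):

    #include <memory>

    #include "arrow/api.h"
    #include "arrow/datum.h"

    // Sketch: the various Datum constructors in action.
    void MakeDatums(const std::shared_ptr<arrow::Array>& array,
                    const std::shared_ptr<arrow::Table>& table) {
      arrow::Datum from_array(array);   // stored as ArrayData internally
      arrow::Datum from_table(table);   // cheap: shares ownership of the Table
      arrow::Datum from_scalar(arrow::MakeScalar(int64_t{7}));  // Scalar
      arrow::Datum from_int(int32_t{42});  // convenience constructor -> int32 scalar

      (void)from_array;
      (void)from_table;
      (void)from_scalar;
      (void)from_int;
    }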
 
-  // Cast from subtypes of Array or Scalar to Datum
+  /// \brief Cast from concrete subtypes of Array or Scalar to Datum
   template <typename T, bool IsArray = std::is_base_of_v<Array, T>,
             bool IsScalar = std::is_base_of_v<Scalar, T>,
             typename = enable_if_t<IsArray || IsScalar>>
@@ -93,7 +119,7 @@ struct ARROW_EXPORT Datum {
       : Datum(std::shared_ptr<typename std::conditional<IsArray, Array, Scalar>::type>(
             std::move(value))) {}
 
-  // Cast from subtypes of Array or Scalar to Datum
+  /// \brief Cast from concrete subtypes of Array or Scalar to Datum
   template <typename T, typename TV = typename std::remove_reference<T>::type,
             bool IsArray = std::is_base_of_v<Array, T>,
             bool IsScalar = std::is_base_of_v<Scalar, T>,
@@ -101,32 +127,48 @@ Datum(T&& value)  // NOLINT implicit conversion
       : Datum(std::make_shared<TV>(std::forward<T>(value))) {}
 
-  // Many Scalars are copyable, let that happen
+  /// \brief Copy from concrete subtypes of Scalar.
+  ///
+  /// The concrete scalar type must be copyable (not all of them are).
   template <typename T, typename = enable_if_t<std::is_base_of_v<Scalar, T>>>
   Datum(const T& value)  // NOLINT implicit conversion
       : Datum(std::make_shared<T>(value)) {}
 
   // Convenience constructors
+  /// \brief Convenience constructor storing a bool scalar.
   explicit Datum(bool value);
+  /// \brief Convenience constructor storing an int8 scalar.
   explicit Datum(int8_t value);
+  /// \brief Convenience constructor storing a uint8 scalar.
   explicit Datum(uint8_t value);
+  /// \brief Convenience constructor storing an int16 scalar.
   explicit Datum(int16_t value);
+  /// \brief Convenience constructor storing a uint16 scalar.
   explicit Datum(uint16_t value);
+  /// \brief Convenience constructor storing an int32 scalar.
   explicit Datum(int32_t value);
+  /// \brief Convenience constructor storing a uint32 scalar.
   explicit Datum(uint32_t value);
+  /// \brief Convenience constructor storing an int64 scalar.
   explicit Datum(int64_t value);
+  /// \brief Convenience constructor storing a uint64 scalar.
   explicit Datum(uint64_t value);
+  /// \brief Convenience constructor storing a float scalar.
   explicit Datum(float value);
+  /// \brief Convenience constructor storing a double scalar.
   explicit Datum(double value);
+  /// \brief Convenience constructor storing a string scalar.
   explicit Datum(std::string value);
+  /// \brief Convenience constructor storing a string scalar.
   explicit Datum(const char* value);
 
-  // Forward to convenience constructors for a DurationScalar from std::chrono::duration
+  /// \brief Convenience constructor for a DurationScalar from std::chrono::duration
   template