From 503ed90303623cb5f42697b50ccf1fe73298e46c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 17 Apr 2016 12:14:34 +0200 Subject: [PATCH] Internal C-Blosc bumped to 1.8.1 --- MANIFEST.in | 2 +- c-blosc/.editorconfig | 20 + c-blosc/.gitignore | 1 + c-blosc/.mailmap | 8 +- c-blosc/.travis.yml | 21 +- c-blosc/ANNOUNCE.rst | 34 +- c-blosc/CMakeLists.txt | 161 +- c-blosc/LICENSES/BITSHUFFLE.txt | 21 + c-blosc/LICENSES/BLOSC.txt | 4 +- c-blosc/LICENSES/LZ4.txt | 2 +- c-blosc/LICENSES/STDINT.txt | 12 +- c-blosc/README.rst | 206 ++- c-blosc/README_HEADER.rst | 6 +- c-blosc/RELEASE_NOTES.rst | 262 ++- c-blosc/RELEASING.rst | 13 +- c-blosc/THANKS.rst | 13 +- c-blosc/appveyor.yml | 37 + c-blosc/bench/CMakeLists.txt | 105 +- c-blosc/bench/Makefile | 2 +- c-blosc/bench/Makefile.mingw | 2 +- c-blosc/bench/bench.c | 324 ++-- c-blosc/bench/plot-speeds.py | 5 +- c-blosc/blosc/CMakeLists.txt | 88 +- c-blosc/blosc/bitshuffle-avx2.c | 248 +++ c-blosc/blosc/bitshuffle-avx2.h | 38 + c-blosc/blosc/bitshuffle-generic.c | 197 +++ c-blosc/blosc/bitshuffle-generic.h | 151 ++ c-blosc/blosc/bitshuffle-sse2.c | 467 +++++ c-blosc/blosc/bitshuffle-sse2.h | 52 + c-blosc/blosc/blosc-export.h | 45 + c-blosc/blosc/blosc.c | 1418 +++++++-------- c-blosc/blosc/blosc.h | 167 +- c-blosc/blosc/blosclz.c | 189 +- c-blosc/blosc/blosclz.h | 13 +- c-blosc/blosc/config.h.in | 1 + c-blosc/blosc/shuffle-avx2.c | 757 ++++++++ c-blosc/blosc/shuffle-avx2.h | 36 + c-blosc/blosc/shuffle-common.h | 34 + c-blosc/blosc/shuffle-generic.c | 25 + c-blosc/blosc/shuffle-generic.h | 99 ++ c-blosc/blosc/shuffle-sse2.c | 626 +++++++ c-blosc/blosc/shuffle-sse2.h | 36 + c-blosc/blosc/shuffle.c | 823 ++++----- c-blosc/blosc/shuffle.h | 65 +- c-blosc/cmake/FindSSE.cmake | 125 -- c-blosc/examples/multithread.c | 12 +- c-blosc/examples/simple.c | 4 +- c-blosc/examples/win-dynamic-linking.c | 128 ++ c-blosc/hdf5/CMakeLists.txt | 38 - c-blosc/hdf5/README.rst | 62 - c-blosc/internal-complibs/lz4-1.7.2/lz4.c | 1564 
++++++++++++++++ c-blosc/internal-complibs/lz4-1.7.2/lz4.h | 360 ++++ c-blosc/internal-complibs/lz4-1.7.2/lz4hc.c | 748 ++++++++ c-blosc/internal-complibs/lz4-1.7.2/lz4hc.h | 189 ++ c-blosc/internal-complibs/lz4-r119/lz4.c | 1247 ------------- c-blosc/internal-complibs/lz4-r119/lz4.h | 306 ---- c-blosc/internal-complibs/lz4-r119/lz4hc.c | 892 ---------- c-blosc/internal-complibs/lz4-r119/lz4hc.h | 173 -- c-blosc/scripts/travis-before-install.sh | 16 + c-blosc/tests/CMakeLists.txt | 86 +- c-blosc/tests/Makefile | 2 +- c-blosc/tests/gcc-segfault-issue.c | 80 + c-blosc/tests/test_all.sh | 4 +- c-blosc/tests/test_api.c | 19 +- c-blosc/tests/test_basics.c | 207 --- c-blosc/tests/test_common.h | 111 +- c-blosc/tests/test_compress_roundtrip.c | 134 ++ c-blosc/tests/test_compress_roundtrip.csv | 267 +++ c-blosc/tests/test_getitem.c | 130 ++ c-blosc/tests/test_getitem.csv | 400 +++++ c-blosc/tests/test_maxout.c | 117 ++ c-blosc/tests/test_shuffle_roundtrip_avx2.c | 137 ++ c-blosc/tests/test_shuffle_roundtrip_avx2.csv | 400 +++++ .../tests/test_shuffle_roundtrip_generic.c | 93 + .../tests/test_shuffle_roundtrip_generic.csv | 134 ++ c-blosc/tests/test_shuffle_roundtrip_sse2.c | 138 ++ c-blosc/tests/test_shuffle_roundtrip_sse2.csv | 400 +++++ cpuinfo.py | 1565 +++++++++++++++++ hdf5-blosc/.gitignore | 36 + hdf5-blosc/.travis.yml | 22 + hdf5-blosc/CMakeLists.txt | 71 + hdf5-blosc/LICENSES/BLOSC.txt | 21 + hdf5-blosc/LICENSES/BLOSC_HDF5.txt | 21 + {c-blosc => hdf5-blosc}/LICENSES/H5PY.txt | 0 hdf5-blosc/README.rst | 69 + .../hdf5 => hdf5-blosc/src}/blosc_filter.c | 14 +- .../hdf5 => hdf5-blosc/src}/blosc_filter.h | 0 .../hdf5 => hdf5-blosc/src}/blosc_plugin.c | 0 .../hdf5 => hdf5-blosc/src}/blosc_plugin.h | 0 {c-blosc/hdf5 => hdf5-blosc/src}/example.c | 9 +- hdf5-blosc/travis-before-install.sh | 16 + setup.py | 72 +- 92 files changed, 12703 insertions(+), 4772 deletions(-) create mode 100644 c-blosc/.editorconfig create mode 100644 c-blosc/LICENSES/BITSHUFFLE.txt create mode 
100644 c-blosc/appveyor.yml create mode 100644 c-blosc/blosc/bitshuffle-avx2.c create mode 100644 c-blosc/blosc/bitshuffle-avx2.h create mode 100644 c-blosc/blosc/bitshuffle-generic.c create mode 100644 c-blosc/blosc/bitshuffle-generic.h create mode 100644 c-blosc/blosc/bitshuffle-sse2.c create mode 100644 c-blosc/blosc/bitshuffle-sse2.h create mode 100644 c-blosc/blosc/blosc-export.h create mode 100644 c-blosc/blosc/shuffle-avx2.c create mode 100644 c-blosc/blosc/shuffle-avx2.h create mode 100644 c-blosc/blosc/shuffle-common.h create mode 100644 c-blosc/blosc/shuffle-generic.c create mode 100644 c-blosc/blosc/shuffle-generic.h create mode 100644 c-blosc/blosc/shuffle-sse2.c create mode 100644 c-blosc/blosc/shuffle-sse2.h delete mode 100644 c-blosc/cmake/FindSSE.cmake create mode 100644 c-blosc/examples/win-dynamic-linking.c delete mode 100644 c-blosc/hdf5/CMakeLists.txt delete mode 100644 c-blosc/hdf5/README.rst create mode 100644 c-blosc/internal-complibs/lz4-1.7.2/lz4.c create mode 100644 c-blosc/internal-complibs/lz4-1.7.2/lz4.h create mode 100644 c-blosc/internal-complibs/lz4-1.7.2/lz4hc.c create mode 100644 c-blosc/internal-complibs/lz4-1.7.2/lz4hc.h delete mode 100644 c-blosc/internal-complibs/lz4-r119/lz4.c delete mode 100644 c-blosc/internal-complibs/lz4-r119/lz4.h delete mode 100644 c-blosc/internal-complibs/lz4-r119/lz4hc.c delete mode 100644 c-blosc/internal-complibs/lz4-r119/lz4hc.h create mode 100755 c-blosc/scripts/travis-before-install.sh create mode 100644 c-blosc/tests/gcc-segfault-issue.c delete mode 100644 c-blosc/tests/test_basics.c create mode 100644 c-blosc/tests/test_compress_roundtrip.c create mode 100644 c-blosc/tests/test_compress_roundtrip.csv create mode 100644 c-blosc/tests/test_getitem.c create mode 100644 c-blosc/tests/test_getitem.csv create mode 100644 c-blosc/tests/test_maxout.c create mode 100644 c-blosc/tests/test_shuffle_roundtrip_avx2.c create mode 100644 c-blosc/tests/test_shuffle_roundtrip_avx2.csv create mode 100644 
c-blosc/tests/test_shuffle_roundtrip_generic.c create mode 100644 c-blosc/tests/test_shuffle_roundtrip_generic.csv create mode 100644 c-blosc/tests/test_shuffle_roundtrip_sse2.c create mode 100644 c-blosc/tests/test_shuffle_roundtrip_sse2.csv create mode 100644 cpuinfo.py create mode 100644 hdf5-blosc/.gitignore create mode 100644 hdf5-blosc/.travis.yml create mode 100644 hdf5-blosc/CMakeLists.txt create mode 100644 hdf5-blosc/LICENSES/BLOSC.txt create mode 100644 hdf5-blosc/LICENSES/BLOSC_HDF5.txt rename {c-blosc => hdf5-blosc}/LICENSES/H5PY.txt (100%) create mode 100644 hdf5-blosc/README.rst rename {c-blosc/hdf5 => hdf5-blosc/src}/blosc_filter.c (96%) rename {c-blosc/hdf5 => hdf5-blosc/src}/blosc_filter.h (100%) rename {c-blosc/hdf5 => hdf5-blosc/src}/blosc_plugin.c (100%) rename {c-blosc/hdf5 => hdf5-blosc/src}/blosc_plugin.h (100%) rename {c-blosc/hdf5 => hdf5-blosc/src}/example.c (93%) create mode 100755 hdf5-blosc/travis-before-install.sh diff --git a/MANIFEST.in b/MANIFEST.in index 0993879d4..4b72abf6b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,7 +7,7 @@ recursive-include tables/tests *.h5 *.mat recursive-include tables/nodes/tests *.h5 *.dat *.xbm recursive-include src *.c *.h Makefile -include c-blosc/hdf5/blosc_filter.? +include hdf5-blosc/src/blosc_filter.? 
recursive-include c-blosc/blosc *.c *.h recursive-include c-blosc/internal-complibs *.c *.cc *.h diff --git a/c-blosc/.editorconfig b/c-blosc/.editorconfig new file mode 100644 index 000000000..17ad9d19a --- /dev/null +++ b/c-blosc/.editorconfig @@ -0,0 +1,20 @@ +; Top-most EditorConfig file +root = true + +; Global settings +[*] +end_of_line = LF +indent_style = space +trim_trailing_whitespace = true +insert_final_newline = true + +; C source files +[*.{h,c}] +indent_size = 2 + +; CMake +[CMakeLists.txt] +indent_size = 4 + +[*.cmake] +indent_size = 4 diff --git a/c-blosc/.gitignore b/c-blosc/.gitignore index faf2156b4..1596d0164 100644 --- a/c-blosc/.gitignore +++ b/c-blosc/.gitignore @@ -1 +1,2 @@ bench/bench +build/ diff --git a/c-blosc/.mailmap b/c-blosc/.mailmap index 2174cede8..554046d0e 100644 --- a/c-blosc/.mailmap +++ b/c-blosc/.mailmap @@ -1,4 +1,4 @@ -Francesc Alted FrancescAlted -Francesc Alted FrancescAlted -Francesc Alted FrancescAlted - +Francesc Alted FrancescAlted +Francesc Alted FrancescAlted +Francesc Alted FrancescAlted +Francesc Alted FrancescAlted diff --git a/c-blosc/.travis.yml b/c-blosc/.travis.yml index 5ba27cde0..0887c5d49 100644 --- a/c-blosc/.travis.yml +++ b/c-blosc/.travis.yml @@ -1,12 +1,23 @@ language: c + +os: +- linux +- osx + compiler: - gcc - clang -install: sudo apt-get install libhdf5-serial-dev -#install: sudo apt-get install libsnappy-dev zlib1g-dev libhdf5-serial-dev -#install: sudo apt-get install liblz4-dev libsnappy-dev zlib1g-dev libhdf5-dev + +before_install: ./scripts/travis-before-install.sh + +#install: sudo apt-get install libsnappy-dev zlib1g-dev +#install: sudo apt-get install liblz4-dev libsnappy-dev zlib1g-dev + before_script: - mkdir build - cd build - - cmake -DBUILD_HDF5_FILTER=TRUE .. -script: make && make test + - cmake .. + +script: + - cmake --build . 
--config Release + - ctest diff --git a/c-blosc/ANNOUNCE.rst b/c-blosc/ANNOUNCE.rst index f89b32f07..fe73474b8 100644 --- a/c-blosc/ANNOUNCE.rst +++ b/c-blosc/ANNOUNCE.rst @@ -1,38 +1,32 @@ =============================================================== - Announcing c-blosc 1.4.4 - A blocking, shuffling and lossless compression library + Announcing c-blosc 1.8.1 + A blocking, shuffling and lossless compression library for C =============================================================== What is new? ============ -* New computation of blocksize to be in sync with c-blosc 1.6.1. - -* New parametrization of the hash table for blosclz (synced with c-blosc - 1.6.1) +This is a patch release for disabling the use of +__builtin_cpu_supports() call for compatibility with GCC 5.3.1 (the one +in forthcoming Ubuntu/Xenial). Details in: For more info, please see the release notes in: -https://github.com/Blosc/c-blosc/wiki/Release-notes +https://github.com/Blosc/c-blosc/blob/master/RELEASE_NOTES.rst What is it? =========== -Blosc (http://www.blosc.org) is a high performance compressor +Blosc (http://www.blosc.org) is a high performance meta-compressor optimized for binary data. It has been designed to transmit data to the processor cache faster than the traditional, non-compressed, direct memory fetch approach via a memcpy() OS call. -Blosc is the first compressor (that I'm aware of) that is meant not -only to reduce the size of large datasets on-disk or in-memory, but -also to accelerate object manipulations that are memory-bound. - -Blosc has a Python wrapper called python-blosc -(https://github.com/Blosc/python-blosc) with a high-performance -interface to NumPy too. There is also a handy command line for Blosc -called Bloscpack (https://github.com/Blosc/bloscpack) that allows you to -compress large binary datafiles on-disk. +Blosc has internal support for different compressors like its internal +BloscLZ, but also LZ4, LZ4HC, Snappy and Zlib.
This way these can +automatically leverage the multithreading and pre-filtering +(shuffling) capabilities that comes with Blosc. Download sources @@ -61,9 +55,3 @@ http://groups.google.es/group/blosc Enjoy Data! - -.. Local Variables: -.. mode: rst -.. coding: utf-8 -.. fill-column: 70 -.. End: diff --git a/c-blosc/CMakeLists.txt b/c-blosc/CMakeLists.txt index 8f5283b82..20a3ceae6 100644 --- a/c-blosc/CMakeLists.txt +++ b/c-blosc/CMakeLists.txt @@ -5,8 +5,6 @@ # # BUILD_STATIC: default ON # build the static version of the Blosc library -# BUILD_HDF5_FILTER: default OFF -# build the compression filter for the HDF5 library # BUILD_TESTS: default ON # build test programs and generates the "test" target # BUILD_BENCHMARKS: default ON @@ -17,14 +15,27 @@ # do not include support for the Snappy library # DEACTIVATE_ZLIB: default OFF # do not include support for the Zlib library -# PREFER_EXTERNAL_COMPLIBS: default ON -# when found, use the installed compression libs instead of included sources -# TEST_INCLUDE_BENCH_SINGLE_1: default ON -# add a test that runs the benchmark program passing "single" with 1 thread -# as first parameter -# TEST_INCLUDE_BENCH_SINGLE_N: default ON -# add a test that runs the benchmark program passing "single" with all threads -# as first parameter +# PREFER_EXTERNAL_LZ4: default OFF +# when found, use the installed LZ4 libs instead of included +# sources +# PREFER_EXTERNAL_SNAPPY: default ON +# when found, use the installed Snappy libs instead of included +# sources +# PREFER_EXTERNAL_ZLIB: default ON +# when found, use the installed zlib libs instead of included +# sources +# TEST_INCLUDE_BENCH_SHUFFLE_1: default ON +# add a test that runs the benchmark program passing "shuffle" with 1 +# thread as second parameter +# TEST_INCLUDE_BENCH_SHUFFLE_N: default ON +# add a test that runs the benchmark program passing "shuffle" with all +# threads as second parameter +# TEST_INCLUDE_BENCH_BITSHUFFLE_1: default ON +# add a test that runs the 
benchmark program passing "bitshuffle" with 1 +# thread as second parameter +# TEST_INCLUDE_BENCH_BITSHUFFLE_N: default ON +# add a test that runs the benchmark program passing "bitshuffle" with +# all threads as second parameter # TEST_INCLUDE_BENCH_SUITE: default OFF # add a test that runs the benchmark program passing "suite" # as first parameter @@ -45,11 +56,9 @@ # # LIB: includes blosc.so # DEV: static includes blosc.a and blosc.h -# HDF5_FILTER: includes blosc_filter.so -# HDF5_FILTER_DEV: includes blosc_filter.h -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 2.8.10) project(blosc) # parse the full version numbers from blosc.h @@ -68,8 +77,6 @@ message("Configuring for Blosc version: " ${BLOSC_VERSION_STRING}) # options option(BUILD_STATIC "Build a static version of the blosc library." ON) -option(BUILD_HDF5_FILTER - "Build a blosc based compression filter for the HDF5 library" OFF) option(BUILD_TESTS "Build test programs form the blosc compression library" ON) option(BUILD_BENCHMARKS @@ -80,30 +87,33 @@ option(DEACTIVATE_SNAPPY "Do not include support for the SNAPPY library." OFF) option(DEACTIVATE_ZLIB "Do not include support for the ZLIB library." OFF) -option(PREFER_EXTERNAL_COMPLIBS - "When found, use the installed compression libs instead of included sources." ON) +option(PREFER_EXTERNAL_LZ4 + "Find and use external LZ4 library instead of included sources." OFF) +option(PREFER_EXTERNAL_SNAPPY + "Find and use external Snappy library instead of included sources." ON) +option(PREFER_EXTERNAL_ZLIB + "Find and use external zlib library instead of included sources." ON) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") -if(NOT PREFER_EXTERNAL_COMPLIBS) - message(STATUS "Finding external libraries disabled. 
Using internal sources.") -endif(NOT PREFER_EXTERNAL_COMPLIBS) - - if(NOT DEACTIVATE_LZ4) - if(PREFER_EXTERNAL_COMPLIBS) + if(PREFER_EXTERNAL_LZ4) find_package(LZ4) - endif(PREFER_EXTERNAL_COMPLIBS) + else() + message(STATUS "Using LZ4 internal sources.") + endif(PREFER_EXTERNAL_LZ4) # HAVE_LZ4 will be set to true because even if the library is # not found, we will use the included sources for it set(HAVE_LZ4 TRUE) endif(NOT DEACTIVATE_LZ4) if(NOT DEACTIVATE_SNAPPY) - if(PREFER_EXTERNAL_COMPLIBS) + if(PREFER_EXTERNAL_SNAPPY) find_package(Snappy) - endif(PREFER_EXTERNAL_COMPLIBS) + else() + message(STATUS "Using Snappy internal sources.") + endif(PREFER_EXTERNAL_SNAPPY) # HAVE_SNAPPY will be set to true because even if the library is not found, # we will use the included sources for it set(HAVE_SNAPPY TRUE) @@ -111,13 +121,15 @@ endif(NOT DEACTIVATE_SNAPPY) if(NOT DEACTIVATE_ZLIB) # import the ZLIB_ROOT environment variable to help finding the zlib library - if(PREFER_EXTERNAL_COMPLIBS) + if(PREFER_EXTERNAL_ZLIB) set(ZLIB_ROOT $ENV{ZLIB_ROOT}) - find_package( ZLIB ) + find_package(ZLIB) if (NOT ZLIB_FOUND ) message(STATUS "No zlib found. Using internal sources.") endif (NOT ZLIB_FOUND ) - endif(PREFER_EXTERNAL_COMPLIBS) + else() + message(STATUS "Using zlib internal sources.") + endif(PREFER_EXTERNAL_ZLIB) # HAVE_ZLIB will be set to true because even if the library is not found, # we will use the included sources for it set(HAVE_ZLIB TRUE) @@ -125,16 +137,75 @@ endif(NOT DEACTIVATE_ZLIB) # create the config.h file configure_file ("blosc/config.h.in" "blosc/config.h" ) + # now make sure that you set the build directory on your "Include" path when compiling include_directories("${PROJECT_BINARY_DIR}/blosc/") -# force the default build type to Release. -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." 
- FORCE) -endif(NOT CMAKE_BUILD_TYPE) +# If the build type is not set, default to Release. +set(BLOSC_DEFAULT_BUILD_TYPE Release) +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "No build type specified. Defaulting to '${BLOSC_DEFAULT_BUILD_TYPE}'.") + set(CMAKE_BUILD_TYPE ${BLOSC_DEFAULT_BUILD_TYPE} CACHE STRING + "Choose the type of build." FORCE) + + # Set the possible values of build type for cmake-gui + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() +# Based on the target system's processor and the compiler being used, +# set build variables indicating which hardware features can be targeted +# by the compiler. Note we DO NOT check which hardware features are supported +# by this (the host) system, because we want to be able to support compiling +# for newer hardware on older machines as well as cross-compilation. +message(STATUS "Building for system processor ${CMAKE_SYSTEM_PROCESSOR}") +if(CMAKE_SYSTEM_PROCESSOR STREQUAL i386 OR + CMAKE_SYSTEM_PROCESSOR STREQUAL i686 OR + CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR + CMAKE_SYSTEM_PROCESSOR STREQUAL amd64 OR + CMAKE_SYSTEM_PROCESSOR STREQUAL AMD64) + if(CMAKE_C_COMPILER_ID STREQUAL GNU) + set(COMPILER_SUPPORT_SSE2 TRUE) + if(CMAKE_C_COMPILER_VERSION VERSION_GREATER 4.7 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 4.7) + set(COMPILER_SUPPORT_AVX2 TRUE) + else() + set(COMPILER_SUPPORT_AVX2 FALSE) + endif() + elseif(CMAKE_C_COMPILER_ID STREQUAL Clang) + set(COMPILER_SUPPORT_SSE2 TRUE) + if(CMAKE_C_COMPILER_VERSION VERSION_GREATER 3.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 3.2) + set(COMPILER_SUPPORT_AVX2 TRUE) + else() + set(COMPILER_SUPPORT_AVX2 FALSE) + endif() + elseif(CMAKE_C_COMPILER_ID STREQUAL Intel) + set(COMPILER_SUPPORT_SSE2 TRUE) + if(CMAKE_C_COMPILER_VERSION VERSION_GREATER 14.0 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0) + # icc (ICC) 15.0.3 does not work compiling AVX2 code + # (perhaps my machine does 
not have AVX2 and the compiler + # cannot generate code for that?) + set(COMPILER_SUPPORT_AVX2 FALSE) + else() + set(COMPILER_SUPPORT_AVX2 FALSE) + endif() + elseif(MSVC) + set(COMPILER_SUPPORT_SSE2 TRUE) + if(CMAKE_C_COMPILER_VERSION VERSION_GREATER 18.00.30501 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 18.00.30501) + set(COMPILER_SUPPORT_AVX2 TRUE) + else() + set(COMPILER_SUPPORT_AVX2 FALSE) + endif() + else() + set(COMPILER_SUPPORT_SSE2 FALSE) + set(COMPILER_SUPPORT_AVX2 FALSE) + # Unrecognized compiler. Emit a warning message to let the user know hardware-acceleration won't be available. + message(WARNING "Unable to determine which ${CMAKE_SYSTEM_PROCESSOR} hardware features are supported by the C compiler (${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}).") + endif() +else() + # If the target system processor isn't recognized, emit a warning message to alert the user + # that hardware-acceleration support won't be available but allow configuration to proceed. + message(WARNING "Unrecognized system processor ${CMAKE_SYSTEM_PROCESSOR}. Cannot determine which hardware features (${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}) supports, so hardware-accelerated implementations will not be available.") +endif() # flags # @TODO: set -Wall @@ -142,18 +213,22 @@ endif(NOT CMAKE_BUILD_TYPE) # Set the "-msse2" build flag only if the CMAKE_C_FLAGS is not already set. # Probably "-msse2" should be appended to CMAKE_C_FLAGS_RELEASE. -find_package(SSE) -if(CMAKE_C_COMPILER_ID STREQUAL GNU OR CMAKE_C_COMPILER_ID STREQUAL Clang) - if(NOT CMAKE_C_FLAGS AND SSE2_TRUE) - message(STATUS "SSE2 is here. Adding support for it.") +if(CMAKE_C_COMPILER_ID STREQUAL GNU OR CMAKE_C_COMPILER_ID STREQUAL Clang OR CMAKE_C_COMPILER_ID STREQUAL Intel) + if(NOT CMAKE_C_FLAGS AND COMPILER_SUPPORT_SSE2) set(CMAKE_C_FLAGS -msse2 CACHE STRING "C flags." 
FORCE) - endif(NOT CMAKE_C_FLAGS AND SSE2_TRUE) -endif(CMAKE_C_COMPILER_ID STREQUAL GNU OR CMAKE_C_COMPILER_ID STREQUAL Clang) + endif(NOT CMAKE_C_FLAGS AND COMPILER_SUPPORT_SSE2) +endif(CMAKE_C_COMPILER_ID STREQUAL GNU OR CMAKE_C_COMPILER_ID STREQUAL Clang OR CMAKE_C_COMPILER_ID STREQUAL Intel) if(MSVC) if(NOT CMAKE_C_FLAGS) set(CMAKE_C_FLAGS "/Ox" CACHE STRING "C flags." FORCE) endif(NOT CMAKE_C_FLAGS) + + # Turn off misguided "secure CRT" warnings in MSVC. + # Microsoft wants people to use the MS-specific _s + # versions of certain C functions but this is difficult to do + # in platform-independent code. + add_definitions( -D_CRT_SECURE_NO_WARNINGS ) endif(MSVC) if(WIN32) @@ -170,10 +245,6 @@ if(BUILD_TESTS) add_subdirectory(tests) endif(BUILD_TESTS) -if(BUILD_HDF5_FILTER) - add_subdirectory(hdf5) -endif(BUILD_HDF5_FILTER) - if(BUILD_BENCHMARKS) add_subdirectory(bench) endif(BUILD_BENCHMARKS) diff --git a/c-blosc/LICENSES/BITSHUFFLE.txt b/c-blosc/LICENSES/BITSHUFFLE.txt new file mode 100644 index 000000000..1365ed69b --- /dev/null +++ b/c-blosc/LICENSES/BITSHUFFLE.txt @@ -0,0 +1,21 @@ +Bitshuffle - Filter for improving compression of typed binary data. + +Copyright (c) 2014 Kiyoshi Masui (kiyo@physics.ubc.ca) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/c-blosc/LICENSES/BLOSC.txt b/c-blosc/LICENSES/BLOSC.txt index 1226f9126..6956017a8 100644 --- a/c-blosc/LICENSES/BLOSC.txt +++ b/c-blosc/LICENSES/BLOSC.txt @@ -1,7 +1,6 @@ Blosc - A blocking, shuffling and lossless compression library -Copyright (C) 2009-2012 Francesc Alted -Copyright (C) 2013 Francesc Alted +Copyright (C) 2009-2016 Francesc Alted Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -20,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/c-blosc/LICENSES/LZ4.txt b/c-blosc/LICENSES/LZ4.txt index 39784cb5a..2383e1034 100644 --- a/c-blosc/LICENSES/LZ4.txt +++ b/c-blosc/LICENSES/LZ4.txt @@ -1,6 +1,6 @@ LZ4 - Fast LZ compression algorithm -Copyright (C) 2011-2013, Yann Collet. +Copyright (C) 2011-2014, Yann Collet. 
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without diff --git a/c-blosc/LICENSES/STDINT.txt b/c-blosc/LICENSES/STDINT.txt index 7e9941ad3..c28001d13 100644 --- a/c-blosc/LICENSES/STDINT.txt +++ b/c-blosc/LICENSES/STDINT.txt @@ -1,4 +1,7 @@ -Copyright (c) 2006-2008 Alexander Chemeris +ISO C9x compliant stdint.h for Microsoft Visual Studio +Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 + + Copyright (c) 2006-2013 Alexander Chemeris Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -10,8 +13,9 @@ modification, are permitted provided that the following conditions are met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. The name of the author may be used to endorse or promote products - derived from this software without specific prior written permission. + 3. Neither the name of the product nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF @@ -22,4 +26,4 @@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/c-blosc/README.rst b/c-blosc/README.rst index 0e3f94ab7..92fe041a3 100644 --- a/c-blosc/README.rst +++ b/c-blosc/README.rst @@ -3,8 +3,17 @@ =============================================================== :Author: Francesc Alted -:Contact: francesc@blosc.io +:Contact: francesc@blosc.org :URL: http://www.blosc.org +:Travis CI: |travis| +:Appveyor: |appveyor| + +.. |travis| image:: https://travis-ci.org/Blosc/c-blosc.svg?branch=master + :target: https://travis-ci.org/Blosc/c-blosc + +.. |appveyor| image:: https://ci.appveyor.com/api/projects/status/gccmb03j8ghbj0ig/branch/master?svg=true + :target: https://ci.appveyor.com/project/FrancescAlted/c-blosc/branch/master + What is it? =========== @@ -21,18 +30,19 @@ activity on the memory bus as much as possible. In short, this technique works by dividing datasets in blocks that are small enough to fit in caches of modern processors and perform compression / decompression there. It also leverages, if available, SIMD -instructions (SSE2) and multi-threading capabilities of CPUs, in order -to accelerate the compression / decompression process to a maximum. +instructions (SSE2, AVX2) and multi-threading capabilities of CPUs, in +order to accelerate the compression / decompression process to a +maximum. -Blosc is actually a metacompressor, that meaning that it can use a range -of compression libraries for performing the actual +Blosc is actually a metacompressor, that meaning that it can use a +range of compression libraries for performing the actual compression/decompression. Right now, it comes with integrated support -for BloscLZ (the original one), LZ4, LZ4HC, Snappy and Zlib. Blosc comes -with full sources for all compressors, so in case it does not find the -libraries installed in your system, it will compile from the included -sources and they will be integrated into the Blosc library anyway. That -means that you can trust in having all supported compressors integrated -in Blosc in all supported platforms. 
+for BloscLZ (the original one), LZ4, LZ4HC, Snappy and Zlib. Blosc +comes with full sources for all compressors, so in case it does not +find the libraries installed in your system, it will compile from the +included sources and they will be integrated into the Blosc library +anyway. That means that you can trust in having all supported +compressors integrated in Blosc in all supported platforms. You can see some benchmarks about Blosc performance in [3]_ @@ -41,32 +51,32 @@ details. .. [1] http://www.blosc.org .. [2] http://blosc.org/docs/StarvingCPUs-CISE-2010.pdf -.. [3] http://blosc.org/trac/wiki/SyntheticBenchmarks +.. [3] http://blosc.org/synthetic-benchmarks.html Meta-compression and other advantages over existing compressors =============================================================== -Blosc is not like other compressors: it should rather be called a +C-Blosc is not like other compressors: it should rather be called a meta-compressor. This is so because it can use different compressors -and pre-conditioners (programs that generally improve compression -ratio). At any rate, it can also be called a compressor because it -happens that it already integrates one compressor and one -pre-conditioner, so it can actually work like so. - -Currently it comes with support of BloscLZ, a compressor heavily based -on FastLZ (http://fastlz.org/), LZ4 and LZ4HC -(http://fastcompression.blogspot.com.es/p/lz4.html), Snappy +and filters (programs that generally improve compression ratio). At +any rate, it can also be called a compressor because it happens that +it already comes with several compressor and filters, so it can +actually work like so. 
+ +Currently C-Blosc comes with support of BloscLZ, a compressor heavily +based on FastLZ (http://fastlz.org/), LZ4 and LZ4HC +(https://github.com/Cyan4973/lz4), Snappy (https://github.com/google/snappy) and Zlib (http://www.zlib.net/), as -well as a highly optimized (it can use SSE2 instructions, if -available) Shuffle pre-conditioner (for info on how it works, see -slide 17 of http://www.slideshare.net/PyData/blosc-py-data-2014). -However, different compressors or pre-conditioners may be added in the -future. - -Blosc is in charge of coordinating the compressor and pre-conditioners -so that they can leverage the blocking technique (described above) as -well as multi-threaded execution (if several cores are available) -automatically. That makes that every compressor and pre-conditioner +well as a highly optimized (it can use SSE2 or AVX2 instructions, if +available) shuffle and bitshuffle filters (for info on how and why +shuffling works, see slide 17 of +http://www.slideshare.net/PyData/blosc-py-data-2014). However, +different compressors or filters may be added in the future. + +C-Blosc is in charge of coordinating the different compressor and +filters so that they can leverage the blocking technique (described +above) as well as multi-threaded execution (if several cores are +available) automatically. That makes that every compressor and filter will work at very high speeds, even if it was not initially designed for doing blocking or multi-threading. @@ -74,15 +84,15 @@ Other advantages of Blosc are: * Meant for binary data: can take advantage of the type size meta-information for improved compression ratio (using the - integrated shuffle pre-conditioner). + integrated shuffle and bitshuffle filters). * Small overhead on non-compressible data: only a maximum of (16 + 4 * nthreads) additional bytes over the source buffer length are needed - to compress *every* input. + to compress *any kind of input*. 
-* Maximum destination length: contrarily to many other - compressors, both compression and decompression routines have - support for maximum size lengths for the destination buffer. +* Maximum destination length: contrarily to many other compressors, + both compression and decompression routines have support for maximum + size lengths for the destination buffer. When taken together, all these features set Blosc apart from other similar solutions. @@ -90,65 +100,87 @@ similar solutions. Compiling your application with a minimalistic Blosc ==================================================== -The minimal Blosc consists of the next files (in blosc/ directory):: +The minimal Blosc consists of the next files (in `blosc/ directory +`_):: - blosc.h and blosc.c -- the main routines - shuffle.h and shuffle.c -- the shuffle code - blosclz.h and blosclz.c -- the blosclz compressor + blosc.h and blosc.c -- the main routines + shuffle*.h and shuffle*.c -- the shuffle code + blosclz.h and blosclz.c -- the blosclz compressor Just add these files to your project in order to use Blosc. For -information on compression and decompression routines, see blosc.h. +information on compression and decompression routines, see `blosc.h +`_. -To compile using GCC (4.4 or higher recommended) on Unix: +To compile using GCC (4.9 or higher recommended) on Unix: .. code-block:: console - $ gcc -O3 -msse2 -o myprog myprog.c blosc/*.c -lpthread + $ gcc -O3 -mavx2 -o myprog myprog.c blosc/*.c -Iblosc -lpthread Using Windows and MINGW: .. code-block:: console - $ gcc -O3 -msse2 -o myprog myprog.c blosc\*.c + $ gcc -O3 -mavx2 -o myprog myprog.c -Iblosc blosc\*.c -Using Windows and MSVC (2010 or higher recommended): +Using Windows and MSVC (2013 or higher recommended): .. code-block:: console - $ cl /Ox /Femyprog.exe myprog.c blosc\*.c + $ cl /Ox /Femyprog.exe /Iblosc myprog.c blosc\*.c -A simple usage example is the benchmark in the bench/bench.c file. 
-Another example for using Blosc as a generic HDF5 filter is in the -hdf5/ directory. +In the `examples/ directory +`_ you can find +more hints on how to link your app with Blosc. I have not tried to compile this with compilers other than GCC, clang, MINGW, Intel ICC or MSVC yet. Please report your experiences with your own platforms. -Adding support for other compressors (LZ4, LZ4HC, Snappy, Zlib) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Adding support for other compressors with a minimalistic Blosc +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you want to add support for the LZ4, LZ4HC, Snappy or Zlib -compressors, just add the symbols HAVE_LZ4 (will include both LZ4 and -LZ4HC), HAVE_SNAPPY and HAVE_ZLIB during compilation and add the -libraries. For example, for compiling Blosc with Zlib support do: +The official cmake files (see below) for Blosc try hard to include +support for LZ4, LZ4HC, Snappy, Zlib inside the Blosc library, so +using them is just a matter of calling the appropriate +`blosc_set_compressor() API call +`_. See +an `example here +`_. + +Having said this, it is also easy to use a minimalistic Blosc and just +add the symbols HAVE_LZ4 (will include both LZ4 and LZ4HC), +HAVE_SNAPPY and HAVE_ZLIB during compilation as well as the +appropriate libraries. For example, for compiling with minimalistic +Blosc but with added Zlib support do: .. code-block:: console - $ gcc -O3 -msse2 -o myprog myprog.c blosc/*.c -lpthread -DHAVE_ZLIB -lz + $ gcc -O3 -msse2 -o myprog myprog.c blosc/*.c -Iblosc -lpthread -DHAVE_ZLIB -lz + +In the `bench/ directory +`_ there a couple +of Makefile files (one for UNIX and the other for MinGW) with more +complete building examples, like switching between libraries or +internal sources for the compressors. 
+ +Supported platforms +~~~~~~~~~~~~~~~~~~~ -In the bench/ directory there a couple of Makefile files (one for UNIX -and the other for MinGW) with more complete building examples, like -selecting between libraries or internal sources for the compressors. +Blosc is meant to support all platforms where a C89 compliant C +compiler can be found. The ones that are mostly tested are Intel +(Linux, Mac OSX and Windows) and ARM (Linux), but exotic ones as IBM +Blue Gene Q embedded "A2" processor are reported to work too. Compiling the Blosc library with CMake ====================================== Blosc can also be built, tested and installed using CMake_. Although -this procedure is a bit more invloved than the one described above, it -is the most general because it allows to integrate other compressors -than BloscLZ either from libraries or from internal sources. Hence, -serious library developers should use this way. +this procedure might seem a bit more involved than the one described +above, it is the most general because it allows to integrate other +compressors than BloscLZ either from libraries or from internal +sources. Hence, serious library developers are encouraged to use this +way. The following procedure describes the "out of source" build. @@ -180,9 +212,9 @@ Build, test and install Blosc: .. code-block:: console - $ make - $ make test - $ make install + $ cmake --build . + $ ctest + $ cmake --build . --target install The static and dynamic version of the Blosc library, together with header files, will be installed into the specified @@ -190,13 +222,18 @@ CMAKE_INSTALL_PREFIX. .. _CMake: http://www.cmake.org +Once you have compiled your Blosc library, you can easily link your +apps with it as shown in the `example/ directory +`_. 
+ Adding support for other compressors (LZ4, LZ4HC, Snappy, Zlib) with CMake ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The CMake files in Blosc are configured to automatically detect other compressors like LZ4, LZ4HC, Snappy or Zlib by default. So as long as the libraries and the header files for these libraries are accessible, -these will be used by default. +these will be used by default. See an `example here +`_. *Note on Zlib*: the library should be easily found on UNIX systems, although on Windows, you can help CMake to find it by setting the @@ -207,18 +244,17 @@ directories are. Also, make sure that Zlib DDL library is in your However, the full sources for LZ4, LZ4HC, Snappy and Zlib have been included in Blosc too. So, in general, you should not worry about not having (or CMake not finding) the libraries in your system because in -this case, their sources will be automaticall compiled for you. That +this case, their sources will be automatically compiled for you. That effectively means that you can be confident in having a complete support for all the supported compression libraries in all supported platforms. -If you want to force Blosc to use the included compression sources -instead of trying to find the libraries in the system first, you can -switch off the PREFER_EXTERNAL_COMPLIBS CMake option: +If you want to force Blosc to use external libraries instead of +the included compression sources: .. code-block:: console - $ cmake -DPREFER_EXTERNAL_COMPLIBS=OFF .. + $ cmake -DPREFER_EXTERNAL_LZ4=ON .. You can also disable support for some compression libraries: @@ -227,7 +263,7 @@ You can also disable support for some compression libraries: $ cmake -DDEACTIVATE_SNAPPY=ON .. Mac OSX troubleshooting -======================= +~~~~~~~~~~~~~~~~~~~~~~~ If you run into compilation troubles when using Mac OSX, please make sure that you have installed the command line developer tools. 
You @@ -254,8 +290,10 @@ https://github.com/Blosc/bloscpack Filter for HDF5 =============== -For those that want to use Blosc as a filter in the HDF5 library, -there is a sample implementation in the hdf5/ directory. +For those who want to use Blosc as a filter in the HDF5 library, +there is a sample implementation in the blosc/hdf5 project in: + +https://github.com/Blosc/hdf5 Mailing list ============ @@ -268,23 +306,7 @@ http://groups.google.es/group/blosc Acknowledgments =============== -I'd like to thank the PyTables community that have collaborated in the -exhaustive testing of Blosc. With an aggregate amount of more than -300 TB of different datasets compressed *and* decompressed -successfully, I can say that Blosc is pretty safe now and ready for -production purposes. - -Other important contributions: - -* Valentin Haenel did a terrific work implementing the support for the - Snappy compression, fixing typos and improving docs and the plotting - script. - -* Thibault North, with ideas from Oscar Villellas, contributed a way - to call Blosc from different threads in a safe way. - -* The CMake support was initially contributed by Thibault North, and - Antonio Valentino and Mark Wiebe made great enhancements to it. +See THANKS.rst. ---- diff --git a/c-blosc/README_HEADER.rst b/c-blosc/README_HEADER.rst index d2428f1b5..79b9519e2 100644 --- a/c-blosc/README_HEADER.rst +++ b/c-blosc/README_HEADER.rst @@ -22,14 +22,14 @@ All entries are little endian. :versionlz: (``uint8``) Version of the internal compressor used. :flags and compressor enumeration: - (``bitfield``) The flags of the buffer + (``bitfield``) The flags of the buffer :bit 0 (``0x01``): - Whether the shuffle filter has been applied or not. + Whether the byte-shuffle filter has been applied or not. :bit 1 (``0x02``): Whether the internal buffer is a pure memcpy or not. :bit 2 (``0x04``): - Reserved + Whether the bit-shuffle filter has been applied or not. 
:bit 3 (``0x08``): Reserved :bit 4 (``0x16``): diff --git a/c-blosc/RELEASE_NOTES.rst b/c-blosc/RELEASE_NOTES.rst index 4c9e6ced1..75af4f483 100644 --- a/c-blosc/RELEASE_NOTES.rst +++ b/c-blosc/RELEASE_NOTES.rst @@ -1,37 +1,259 @@ -================================ - Release notes for c-blosc 1.4.4 -================================ +================================= + Release notes for C-Blosc 1.8.1 +================================= :Author: Francesc Alted -:Contact: faltet@blosc.org +:Contact: francesc@blosc.org :URL: http://www.blosc.org -Changes from 1.4.3 to 1.4.4 +Changes from 1.8.0 to 1.8.1 =========================== -* New computation of blocksize to be in sync with c-blosc 1.6.1 +* Disable the use of __builtin_cpu_supports() for GCC 5.3.1 + compatibility. Details in: + https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/ -* New parametrization of the hash table for blosclz (synced with c-blosc - 1.6.1) +Changes from 1.7.1 to 1.8.0 +=========================== + +* The code is (again) compatible with VS2008 and VS2010. This is + important for compatibility with Python 2.6/2.7/3.3/3.4. + +* Introduced a new global lock during blosc_decompress() operation. + As the blosc_compress() was already guarded by a global lock, this + means that the compression/decompression is again thread safe. + However, when using C-Blosc from multi-threaded environments, it is + important to keep using the *_ctx() functions for performance + reasons. NOTE: _ctx() functions will be replaced by more powerful + ones in C-Blosc 2.0. + + +Changes from 1.7.0 to 1.7.1 +=========================== + +* Fixed a bug preventing bitshuffle to work correctly on getitem(). + Now, everything with bitshuffle seems to work correctly. + +* Fixed the thread initialization for blosc_decompress_ctx(). Issue + #158. Thanks to Chris Webers. + +* Fixed a bug in the blocksize computation introduced in 1.7.0. 
This + could have been creating segfaults. + +* Allow bitshuffle to run on 1-byte typesizes. + +* New parametrization of the blocksize to be independent of the + typesize. This allows a smoother speed throughout all typesizes. -Changes from 1.4.2 to 1.4.3 +* lz4 and lz4hc codecs upgraded to 1.7.2 (from 1.7.0). + +* When calling set_nthreads() but not actually changing the number of + threads in the internal pool does not teardown and setup it anymore. + PR #153. Thanks to Santi Villalba. + + +Changes from 1.6.1 to 1.7.0 =========================== +* Added a new 'bitshuffle' filter so that the shuffle takes place at a + bit level and not just at a byte one, which is what it does the + previous 'shuffle' filter. + + For activating this new bit-level filter you only have to pass the + symbol BLOSC_BITSHUFFLE to `blosc_compress()`. For the previous + byte-level one, pass BLOSC_SHUFFLE. For disabling the shuffle, pass + BLOSC_NOSHUFFLE. + + This is a port of the existing filter in + https://github.com/kiyo-masui/bitshuffle. Thanks to Kiyo Masui for + changing the license and allowing its inclusion here. + +* New acceleration mode for LZ4 and BloscLZ codecs that enters in + operation with complevel < 9. This allows for an important boost in + speed with minimal compression ratio loss. Francesc Alted. + +* LZ4 codec updated to 1.7.0 (r130). + +* PREFER_EXTERNAL_COMPLIBS cmake option has been removed and replaced + by the more fine grained PREFER_EXTERNAL_LZ4, PREFER_EXTERNAL_SNAPPY + and PREFER_EXTERNAL_ZLIB. In order to allow the use of the new API + introduced in LZ4 1.7.0, PREFER_EXTERNAL_LZ4 has been set to OFF by + default, whereas PREFER_EXTERNAL_SNAPPY and PREFER_EXTERNAL_ZLIB + continues to be ON. + +* Implemented SSE2 shuffle support for buffers containing a number of + elements which is not a multiple of (typesize * vectorsize). Jack + Pappas. + +* Added SSE2 shuffle/unshuffle routines for types larger than 16 + bytes. Jack Pappas. 
+ +* 'test_basic' suite has been split in components for a much better + granularity on what's a possibly failing test. Also, lots of new + tests have been added. Jack Pappas. + +* Fixed compilation on non-Intel archs (tested on ARM). Zbyszek + Szmek. + +* Modifyied cmake files in order to inform that AVX2 on Visual Studio + is supported only in 2013 update 2 and higher. + +* Added a replacement for stdbool.h for Visual Studio < 2013. + +* blosclz codec adds Win64/Intel as a platform supporting unaligned + addressing. That leads to a speed-up of 2.2x in decompression. + +* New blosc_get_version_string() function for retrieving the version + of the c-blosc library. Useful when linking with dynamic libraries + and one want to know its version. + +* New example (win-dynamic-linking.c) that shows how to link a Blosc + DLL dynamically in run-time (Windows only). + +* The `context.threads_started` is initialized now when decompressing. + This could cause crashes in case you decompressed before compressing + (e.g. directly deserializing blosc buffers). @atchouprakov. + +* The HDF5 filter has been removed from c-blosc and moved into its own + repo at: https://github.com/Blosc/hdf5 + +* The MS Visual Studio 2008 has been tested with c-blosc for ensuring + compatibility with extensions for Python 2.6 and up. + + +Changes from 1.6.0 to 1.6.1 +=========================== + +* Support for *runtime* detection of AVX2 and SSE2 SIMD instructions. + These changes make it possible to compile one single binary that + runs on a system that supports SSE2 or AVX2 (or neither), so the + redistribution problem is fixed (see #101). Thanks to Julian Taylor + and Jack Pappas. + +* Added support for MinGW and TDM-GCC compilers for Windows. Thanks + to yasushima-gd. + * Fixed a bug in blosclz that could potentially overwrite an area beyond the output buffer. See #113. +* New computation for blocksize so that larger typesizes (> 8 bytes) + would benefit of much better compression ratios. 
Speed is not + penalized too much. + +* New parametrization of the hash table for blosclz codec. This + allows better compression in many scenarios, while slightly + increasing the speed. + + +Changes from 1.5.4 to 1.6.0 +=========================== + +* Support for AVX2 is here! The benchmarks with a 4-core Intel + Haswell machine tell that both compression and decompression are + accelerated around a 10%, reaching peaks of 9.6 GB/s during + compression and 26 GB/s during decompression (memcpy() speed for + this machine is 7.5 GB/s for writes and 11.7 GB/s for reads). Many + thanks to @littlezhou for this nice work. + +* Support for HPET (high precision timers) for the `bench` program. + This is particularly important for microbenchmarks like bench is + doing; since they take so little time to run, the granularity of a + less-accurate timer may account for a significant portion of the + runtime of the benchmark itself, skewing the results. Thanks to + Jack Pappas. -Changes from 1.4.1 to 1.4.2 + +Changes from 1.5.3 to 1.5.4 +=========================== + +* Updated to LZ4 1.6.0 (r128). + +* Fix resource leak in t_blosc. Jack Pappas. + +* Better checks during testing. Jack Pappas. + +* Dynamically loadable HDF5 filter plugin. Kiyo Masui. + + +Changes from 1.5.2 to 1.5.3 +=========================== + +* Use llabs function (where available) instead of abs to avoid + truncating the result. Jack Pappas. + +* Use C11 aligned_alloc when it's available. Jack Pappas. + +* Use the built-in stdint.h with MSVC when available. Jack Pappas. + +* Only define the __SSE2__ symbol when compiling with MS Visual C++ + and targeting x64 or x86 with the correct /arch flag set. This + avoids re-defining the symbol which makes other compilers issue + warnings. Jack Pappas. + +* Reinitializing Blosc during a call to set_nthreads() so as to fix + problems with contexts. Francesc Alted. 
+ + + +Changes from 1.5.1 to 1.5.2 +=========================== + +* Using blosc_compress_ctx() / blosc_decompress_ctx() inside the HDF5 + compressor for allowing operation in multiprocess scenarios. See: + https://github.com/PyTables/PyTables/issues/412 + + The drawback of this quick fix is that the Blosc filter will be only + able to use a single thread until another solution can be devised. + + +Changes from 1.5.0 to 1.5.1 =========================== -* The implementation of H5Epush function has been modified in - hdf5-1.8.15 and there is a fix for this. +* Updated to LZ4 1.5.0. Closes #74. -* Use multithreaded or single-threaded code depending on Blosc version. - This is mainly useful for compiling HDF5 apps with non-included Blosc - libraries. +* Added the 'const' qualifier to non SSE2 shuffle functions. Closes #75. + +* Explicitly call blosc_init() in HDF5 blosc_filter.c, fixing a + segfault. + +* Quite a few improvements in cmake files for HDF5 support. Thanks to + Dana Robinson (The HDF Group). + +* Variable 'class' caused problems compiling the HDF5 filter with g++. + Thanks to Laurent Chapon. + +* Small improvements on docstrings of c-blosc main functions. + + +Changes from 1.4.1 to 1.5.0 +=========================== + +* Added new calls for allowing Blosc to be used *simultaneously* + (i.e. lock free) from multi-threaded environments. The new + functions are: + + - blosc_compress_ctx(...) + - blosc_decompress_ctx(...) + + See the new docstrings in blosc.h for how to use them. The previous + API should be completely unaffected. Thanks to Christopher Speller. + +* Optimized copies during BloscLZ decompression. This can make BloscLZ + to decompress up to 1.5x faster in some situations. + +* LZ4 and LZ4HC compressors updated to version 1.3.1. + +* Added an examples directory on how to link apps with Blosc. + +* stdlib.h moved from blosc.c to blosc.h as suggested by Rob Lathm. + +* Fix a warning for {snappy,lz4}-free compilation. Thanks to Andrew Schaaf. 
+ +* Several improvements for CMakeLists.txt (cmake). + +* Fixing C99 compatibility warnings. Thanks to Christopher Speller. Changes from 1.4.0 to 1.4.1 @@ -61,7 +283,7 @@ Changes from 1.3.5 to 1.3.6 * Updated to LZ4 r118 due to a (highly unlikely) security hole. For details see: - + http://fastcompression.blogspot.fr/2014/06/debunking-lz4-20-years-old-bug-myth.html @@ -391,11 +613,3 @@ Changes from 0.8.0 to 0.9 necessary on Mac because 16 bytes alignment is ensured by default. Thanks to Ivan Vilata. Fixes #3. - - - -.. Local Variables: -.. mode: rst -.. coding: utf-8 -.. fill-column: 72 -.. End: diff --git a/c-blosc/RELEASING.rst b/c-blosc/RELEASING.rst index 61b7d1b88..679247075 100644 --- a/c-blosc/RELEASING.rst +++ b/c-blosc/RELEASING.rst @@ -3,7 +3,7 @@ Releasing Blosc ================ :Author: Francesc Alted -:Contact: francesc@blosc.io +:Contact: francesc@blosc.org :Date: 2014-01-15 @@ -21,12 +21,12 @@ Testing Create a new build/ directory, change into it and issue:: $ cmake .. - $ make - $ make test + $ cmake --build . + $ ctest To actually test Blosc the hard way, look at the end of: -http://blosc.org/trac/wiki/SyntheticBenchmarks +http://blosc.org/synthetic-benchmarks.html where instructions on how to intensively test (and benchmark) Blosc are given. @@ -47,14 +47,11 @@ Tagging Announcing ---------- -- Update the release notes in the github wiki: - -https://github.com/Blosc/c-blosc/wiki/Release-notes - - Send an announcement to the blosc, pytables-dev, bcolz and comp.compression lists. Use the ``ANNOUNCE.rst`` file as skeleton (possibly as the definitive version). + Post-release actions -------------------- diff --git a/c-blosc/THANKS.rst b/c-blosc/THANKS.rst index 529464649..548862a92 100644 --- a/c-blosc/THANKS.rst +++ b/c-blosc/THANKS.rst @@ -22,7 +22,14 @@ Other important contributions: avoid the use of the blosc_init() and blosc_destroy(). 
* Jack Pappas contributed important portability enhancements, - specially with MS Visual C++ as well as high precision timers - (HPET) for the benchmark program. + specially runtime and cross-platform detection of SSE2/AVX2 as well + as high precision timers (HPET) for the benchmark program. -* @littlezhou contributed the AVX2 version of shuffle routines. +* @littlezhou implemented the AVX2 version of shuffle routines. + +* Julian Taylor contributed a way to detect AVX2 in runtime and + calling the appropriate routines only if the undelying hardware + supports it. + +* Kiyo Masui for relicensing his bitshuffle project for allowing the + inclusion of part of his code in Blosc. diff --git a/c-blosc/appveyor.yml b/c-blosc/appveyor.yml new file mode 100644 index 000000000..54ecbf3b2 --- /dev/null +++ b/c-blosc/appveyor.yml @@ -0,0 +1,37 @@ +# AppVeyor CI build configuration for c-blosc + +# Before cloning the repo, configure git to handle line endings correctly. +init: + - git config --global core.autocrlf input + +# This is the build version displayed on AppVeyor's UI. +# It's incrementally automatically like travis-ci but allows custom formatting. +version: '{build}' + +environment: + matrix: + - GENERATOR: "Visual Studio 9 2008" + CONFIG: Release + + - GENERATOR: "Visual Studio 9 2008 Win64" + CONFIG: Release + + - GENERATOR: "Visual Studio 10 2010" + CONFIG: Release + + - GENERATOR: "Visual Studio 10 2010 Win64" + CONFIG: Release + + - GENERATOR: "Visual Studio 12 2013" + CONFIG: Release + + - GENERATOR: "Visual Studio 12 Win64" + CONFIG: Release + +build_script: + - cmake "-G%GENERATOR%" -H. 
-B_builds + - cmake --build _builds --config "%CONFIG%" + +test_script: + - ps: cd _builds + - ctest -VV -C "%CONFIG%" diff --git a/c-blosc/bench/CMakeLists.txt b/c-blosc/bench/CMakeLists.txt index b79e623d4..7a10be2a7 100644 --- a/c-blosc/bench/CMakeLists.txt +++ b/c-blosc/bench/CMakeLists.txt @@ -4,69 +4,108 @@ set(SOURCES bench.c) # targets add_executable(bench ${SOURCES}) +if(UNIX AND NOT APPLE) + # cmake is complaining about LINK_PRIVATE in original PR + # and removing it does not seem to hurt, so be it. + # target_link_libraries(bench LINK_PRIVATE rt) + target_link_libraries(bench rt) +endif(UNIX AND NOT APPLE) target_link_libraries(bench blosc_shared) +# have to copy blosc dlls on Windows +if(MSVC) + add_custom_command( + TARGET bench + POST_BUILD + COMMAND ${CMAKE_COMMAND} + ARGS -E copy_if_different + "${PROJECT_BINARY_DIR}/blosc/\$\(Configuration\)/blosc.dll" + "${CMAKE_CURRENT_BINARY_DIR}/\$\(Configuration\)/blosc.dll") +elseif(MINGW) + add_custom_command( + TARGET bench + POST_BUILD + COMMAND ${CMAKE_COMMAND} + ARGS -E copy_if_different + "${PROJECT_BINARY_DIR}/blosc/libblosc.dll" + "${CMAKE_CURRENT_BINARY_DIR}/libblosc.dll") +endif() # tests if(BUILD_TESTS) - option(TEST_INCLUDE_BENCH_SINGLE_1 "Include bench single (1 thread) in the tests" ON) - if(TEST_INCLUDE_BENCH_SINGLE_1) - add_test(test_blosclz_1 bench blosclz single 1) + # The commented tests below take too much time to complete + option(TEST_INCLUDE_BENCH_SHUFFLE_1 "Include bench shuffle (1 thread) in the tests" ON) + if(TEST_INCLUDE_BENCH_SHUFFLE_1) + set(SHUFFLE_1_OPTS shuffle test 1) + add_test(test_blosclz_shuffle_1 bench blosclz ${SHUFFLE_1_OPTS}) if (HAVE_LZ4) - add_test(test_lz4_1 bench lz4 single 1) - add_test(test_lz4hc_1 bench lz4hc single 1) + add_test(test_lz4_shuffle_1 bench lz4 ${SHUFFLE_1_OPTS}) + # add_test(test_lz4hc_shuffle_1 bench lz4hc ${SHUFFLE_1_OPTS}) endif (HAVE_LZ4) if (HAVE_SNAPPY) - add_test(test_snappy_1 bench snappy single 1) + add_test(test_snappy_shuffle_1 bench 
snappy ${SHUFFLE_1_OPTS}) endif (HAVE_SNAPPY) if (HAVE_ZLIB) - add_test(test_zlib_1 bench zlib single 1) + # add_test(test_zlib_shuffle_1 bench zlib ${SHUFFLE_1_OPTS}) endif (HAVE_ZLIB) - endif(TEST_INCLUDE_BENCH_SINGLE_1) + endif(TEST_INCLUDE_BENCH_SHUFFLE_1) - option(TEST_INCLUDE_BENCH_SINGLE_N "Include bench single (multithread) in the tests" ON) - if(TEST_INCLUDE_BENCH_SINGLE_N) - add_test(test_blosclz_n bench blosclz single) + option(TEST_INCLUDE_BENCH_SHUFFLE_N "Include bench shuffle (multithread) in the tests" ON) + if(TEST_INCLUDE_BENCH_SHUFFLE_N) + set(SHUFFLE_N_OPTS shuffle test) + add_test(test_blosclz_shuffle_n bench blosclz ${SHUFFLE_N_OPTS}) if (HAVE_LZ4) - add_test(test_lz4_n bench lz4 single) - add_test(test_lz4hc_n bench lz4hc single) + add_test(test_lz4_shuffle_n bench lz4 ${SHUFFLE_N_OPTS}) + add_test(test_lz4hc_shuffle_n bench lz4hc ${SHUFFLE_N_OPTS}) endif (HAVE_LZ4) if (HAVE_SNAPPY) - add_test(test_snappy_n bench snappy single) + add_test(test_snappy_shuffle_n bench snappy ${SHUFFLE_N_OPTS}) endif (HAVE_SNAPPY) if (HAVE_ZLIB) - add_test(test_zlib_n bench zlib single) + add_test(test_zlib_shuffle_n bench zlib ${SHUFFLE_N_OPTS}) endif (HAVE_ZLIB) - endif(TEST_INCLUDE_BENCH_SINGLE_N) + endif(TEST_INCLUDE_BENCH_SHUFFLE_N) - option(TEST_INCLUDE_BENCH_SUITE "Include bench suite in the tests" OFF) - if(TEST_INCLUDE_BENCH_SUITE) - add_test(test_blosclz bench blosclz suite) + option(TEST_INCLUDE_BENCH_BITSHUFFLE_1 "Include bench bitshuffle (1 thread) in the tests" ON) + if(TEST_INCLUDE_BENCH_BITSHUFFLE_1) + set(BITSHUFFLE_1_OPTS bitshuffle test 1) + add_test(test_blosclz_bitshuffle_1 bench blosclz ${BITSHUFFLE_1_OPTS}) if (HAVE_LZ4) - add_test(test_lz4 bench lz4 suite) - add_test(test_lz4hc bench lz4hc suite) + add_test(test_lz4_bitshuffle_1 bench lz4 ${BITSHUFFLE_1_OPTS}) + # add_test(test_lz4hc_bitshuffle_1 bench lz4hc ${BITSHUFFLE_1_OPTS}) endif (HAVE_LZ4) if (HAVE_SNAPPY) - add_test(test_snappy bench snappy suite) + 
add_test(test_snappy_bitshuffle_1 bench snappy ${BITSHUFFLE_1_OPTS}) endif (HAVE_SNAPPY) if (HAVE_ZLIB) - add_test(test_zlib bench zlib suite) + # add_test(test_zlib_bitshuffle_1 bench zlib ${BITSHUFFLE_1_OPTS}) endif (HAVE_ZLIB) - endif(TEST_INCLUDE_BENCH_SUITE) + endif(TEST_INCLUDE_BENCH_BITSHUFFLE_1) - option(TEST_INCLUDE_BENCH_HARDSUITE "Include bench hardsuite in the tests" OFF) - if(TEST_INCLUDE_BENCH_HARDSUITE) - add_test(test_hardsuite blosc blosclz hardsuite) - endif(TEST_INCLUDE_BENCH_HARDSUITE) + option(TEST_INCLUDE_BENCH_BITSHUFFLE_N "Include bench bitshuffle (multithread) in the tests" ON) + if(TEST_INCLUDE_BENCH_BITSHUFFLE_N) + set(BITSHUFFLE_N_OPTS bitshuffle test) + add_test(test_blosclz_bitshuffle_n bench blosclz ${BITSHUFFLE_N_OPTS}) + if (HAVE_LZ4) + add_test(test_lz4_bitshuffle_n bench lz4 ${BITSHUFFLE_N_OPTS}) + # add_test(test_lz4hc_bitshuffle_n bench lz4hc ${BITSHUFFLE_N_OPTS}) + endif (HAVE_LZ4) + if (HAVE_SNAPPY) + add_test(test_snappy_bitshuffle_n bench snappy ${BITSHUFFLE_N_OPTS}) + endif (HAVE_SNAPPY) + if (HAVE_ZLIB) + # add_test(test_zlib_bitshuffle_n bench zlib ${BITSHUFFLE_N_OPTS}) + endif (HAVE_ZLIB) + endif(TEST_INCLUDE_BENCH_BITSHUFFLE_N) - option(TEST_INCLUDE_BENCH_EXTREMESUITE "Include bench extremesuite in the tests" OFF) - if(TEST_INCLUDE_BENCH_EXTREMESUITE) - add_test(test_extremesuite bench blosclz extremesuite) - endif(TEST_INCLUDE_BENCH_EXTREMESUITE) + option(TEST_INCLUDE_BENCH_SUITE "Include bench suite in the tests" OFF) + if(TEST_INCLUDE_BENCH_SUITE) + add_test(test_hardsuite blosc blosclz shuffle suite) + endif(TEST_INCLUDE_BENCH_SUITE) option(TEST_INCLUDE_BENCH_DEBUGSUITE "Include bench debugsuite in the tests" OFF) if(TEST_INCLUDE_BENCH_DEBUGSUITE) - add_test(test_debugsuite bench debugsuite) + add_test(test_debugsuite bench blosclz shuffle debugsuite) endif(TEST_INCLUDE_BENCH_DEBUGSUITE) endif(BUILD_TESTS) diff --git a/c-blosc/bench/Makefile b/c-blosc/bench/Makefile index 416566404..bb9ad0d77 100644 --- 
a/c-blosc/bench/Makefile +++ b/c-blosc/bench/Makefile @@ -6,7 +6,7 @@ SOURCES = $(wildcard ../blosc/*.c) EXECUTABLE = bench # Support for internal LZ4 and LZ4HC -LZ4_DIR = ../internal-complibs/lz4-r119 +LZ4_DIR = ../internal-complibs/lz4-1.7.0 CFLAGS += -DHAVE_LZ4 -I$(LZ4_DIR) SOURCES += $(wildcard $(LZ4_DIR)/*.c) diff --git a/c-blosc/bench/Makefile.mingw b/c-blosc/bench/Makefile.mingw index 87cbaf1cb..552aa9d49 100644 --- a/c-blosc/bench/Makefile.mingw +++ b/c-blosc/bench/Makefile.mingw @@ -7,7 +7,7 @@ SOURCES = $(wildcard ../blosc/*.c) EXECUTABLE = bench # Support for internal LZ4 -LZ4_DIR = ../internal-complibs/lz4-r119 +LZ4_DIR = ../internal-complibs/lz4-1.7.0 CFLAGS += -DHAVE_LZ4 -I$(LZ4_DIR) SOURCES += $(wildcard $(LZ4_DIR)/*.c) diff --git a/c-blosc/bench/bench.c b/c-blosc/bench/bench.c index 303e71e30..d1a28a8bb 100644 --- a/c-blosc/bench/bench.c +++ b/c-blosc/bench/bench.c @@ -7,12 +7,17 @@ For usage instructions of this benchmark, please see: - http://blosc.pytables.org/trac/wiki/SyntheticBenchmarks + http://blosc.org/synthetic-benchmarks.html I'm collecting speeds for different machines, so the output of your benchmarks and your processor specifications are welcome! - Author: Francesc Alted + Author: Francesc Alted + + Note: Compiling this with VS2008 does not work well with cmake. Here + it is a way to compile the benchmark (with added support for LZ4): + + > cl /DHAVE_LZ4 /arch:SSE2 /Ox /Febench.exe /Iblosc /Iinternal-complibs\lz4-1.7.2 bench\bench.c blosc\blosc.c blosc\blosclz.c blosc\shuffle.c blosc\shuffle-sse2.c blosc\shuffle-generic.c blosc\bitshuffle-generic.c blosc\bitshuffle-sse2.c internal-complibs\lz4-1.7.2\*.c See LICENSES/BLOSC.txt for details about copyright and rights to use. **********************************************************************/ @@ -23,26 +28,25 @@ #include #include #include -#if defined(_WIN32) && !defined(__MINGW32__) +#if defined(_WIN32) + /* For QueryPerformanceCounter(), etc. 
*/ + #include +#elif defined(__MACH__) + #include + #include #include -#else - #include #include +#elif defined(__unix__) + #include + #if defined(__linux__) + #include + #else + #include + #endif +#else + #error Unable to detect platform. #endif -#include - -struct bench_wrap_args -{ - char *compressor; - int nthreads; - int size; - int elsize; - int rshift; - FILE * output_file; -}; - -void *bench_wrap(void * args); #include "../blosc/blosc.h" @@ -58,71 +62,81 @@ int nchunks = NCHUNKS; int niter = 3; /* default number of iterations */ double totalsize = 0.; /* total compressed/decompressed size */ -#if defined(_WIN32) && !defined(__MINGW32__) -#include -#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) - #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 -#else - #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif +/* System-specific high-precision timing functions. */ +#if defined(_WIN32) -struct timezone -{ - int tz_minuteswest; /* minutes W of Greenwich */ - int tz_dsttime; /* type of dst correction */ -}; +/* The type of timestamp used on this system. */ +#define blosc_timestamp_t LARGE_INTEGER -int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tmpres /= 10; /*convert into microseconds*/ - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } +/* Set a timestamp value to the current time. */ +void blosc_set_timestamp(blosc_timestamp_t* timestamp) { + /* Ignore the return value, assume the call always succeeds. 
*/ + QueryPerformanceCounter(timestamp); +} - if (NULL != tz) - { - if (!tzflag) - { - _tzset(); - tzflag++; - } - tz->tz_minuteswest = _timezone / 60; - tz->tz_dsttime = _daylight; - } +/* Given two timestamp values, return the difference in microseconds. */ +double blosc_elapsed_usecs(blosc_timestamp_t start_time, blosc_timestamp_t end_time) { + LARGE_INTEGER CounterFreq; + QueryPerformanceFrequency(&CounterFreq); - return 0; + return (double)(end_time.QuadPart - start_time.QuadPart) / ((double)CounterFreq.QuadPart / 1e6); } -#endif /* _WIN32 */ +#else -/* Given two timeval stamps, return the difference in seconds */ -float getseconds(struct timeval last, struct timeval current) { - int sec, usec; +/* The type of timestamp used on this system. */ +#define blosc_timestamp_t struct timespec + +/* Set a timestamp value to the current time. */ +void blosc_set_timestamp(blosc_timestamp_t* timestamp) { +#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time + clock_serv_t cclock; + mach_timespec_t mts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &mts); + mach_port_deallocate(mach_task_self(), cclock); + timestamp->tv_sec = mts.tv_sec; + timestamp->tv_nsec = mts.tv_nsec; +#else + clock_gettime(CLOCK_MONOTONIC, timestamp); +#endif +} + +/* Given two timestamp values, return the difference in microseconds. 
*/ +double blosc_elapsed_usecs(blosc_timestamp_t start_time, blosc_timestamp_t end_time) { + return (1e6 * (end_time.tv_sec - start_time.tv_sec)) + + (1e-3 * (end_time.tv_nsec - start_time.tv_nsec)); +} + +#endif - sec = current.tv_sec - last.tv_sec; - usec = current.tv_usec - last.tv_usec; - return (float)(((double)sec + usec*1e-6)); +/* Given two timeval stamps, return the difference in seconds */ +double getseconds(blosc_timestamp_t last, blosc_timestamp_t current) { + return 1e-6 * blosc_elapsed_usecs(last, current); } /* Given two timeval stamps, return the time per chunk in usec */ -float get_usec_chunk(struct timeval last, struct timeval current) { - return (float)(getseconds(last, current)/(niter*nchunks)*1e6); +double get_usec_chunk(blosc_timestamp_t last, blosc_timestamp_t current, int niter, size_t nchunks) { + double elapsed_usecs = (double)blosc_elapsed_usecs(last, current); + return elapsed_usecs / (double)(niter * nchunks); } +/* Define posix_memalign for Windows */ +#if defined(_WIN32) +#include + +int posix_memalign(void **memptr, size_t alignment, size_t size) +{ + *memptr = _aligned_malloc(size, alignment); + return 0; +} + +/* Buffers allocated with _aligned_malloc need to be freed with _aligned_free. */ +#define aligned_free(memptr) _aligned_free(memptr) +#else +/* If not using MSVC, aligned memory can be freed in the usual way. 
*/ +#define aligned_free(memptr) free(memptr) +#endif /* defined(_WIN32) && !defined(__MINGW32__) */ int get_value(int i, int rshift) { int v; @@ -156,16 +170,26 @@ void init_buffer(void *src, int size, int rshift) { } -void do_bench(char *compressor, int nthreads, int size, int elsize, +void do_bench(char *compressor, char *shuffle, int nthreads, int size, int elsize, int rshift, FILE * ofile) { void *src, *srccpy; void *dest[NCHUNKS], *dest2; int nbytes = 0, cbytes = 0; - int i, j; - struct timeval last, current; - float tmemcpy, tshuf, tunshuf; - int clevel, doshuffle=1; + int i, j, retcode; unsigned char *orig, *round; + blosc_timestamp_t last, current; + double tmemcpy, tshuf, tunshuf; + int clevel, doshuffle; + + if (strcmp(shuffle, "shuffle") == 0) { + doshuffle = BLOSC_SHUFFLE; + } + else if (strcmp(shuffle, "bitshuffle") == 0) { + doshuffle = BLOSC_BITSHUFFLE; + } + else if (strcmp(shuffle, "noshuffle") == 0) { + doshuffle = BLOSC_NOSHUFFLE; + } blosc_set_nthreads(nthreads); if(blosc_set_compressor(compressor) < 0){ @@ -175,23 +199,19 @@ void do_bench(char *compressor, int nthreads, int size, int elsize, } /* Initialize buffers */ - src = malloc(size); srccpy = malloc(size); - dest2 = malloc(size); + retcode = posix_memalign( (void **)(&src), 32, size); + retcode = posix_memalign( (void **)(&dest2), 32, size); + /* zero src to initialize byte on it, and not only multiples of 4 */ memset(src, 0, size); init_buffer(src, size, rshift); memcpy(srccpy, src, size); for (j = 0; j < nchunks; j++) { - dest[j] = malloc(size+BLOSC_MAX_OVERHEAD); + retcode = posix_memalign( (void **)(&dest[j]), 32, size+BLOSC_MAX_OVERHEAD); } - /* Warm destination memory (memcpy() will go a bit faster later on) */ - for (j = 0; j < nchunks; j++) { - memcpy(dest[j], src, size); - } - - fprintf(ofile, "--> %d, %d, %d, %d, %s\n", nthreads, size, elsize, rshift, compressor); + fprintf(ofile, "--> %d, %d, %d, %d, %s, %s\n", nthreads, size, elsize, rshift, compressor, shuffle); 
fprintf(ofile, "********************** Run info ******************************\n"); fprintf(ofile, "Blosc version: %s (%s)\n", BLOSC_VERSION_STRING, BLOSC_VERSION_DATE); fprintf(ofile, "Using synthetic data with %d significant bits (out of 32)\n", rshift); @@ -200,43 +220,43 @@ void do_bench(char *compressor, int nthreads, int size, int elsize, fprintf(ofile, "Number of threads: %d\n", nthreads); fprintf(ofile, "********************** Running benchmarks *********************\n"); - gettimeofday(&last, NULL); + blosc_set_timestamp(&last); for (i = 0; i < niter; i++) { for (j = 0; j < nchunks; j++) { memcpy(dest[j], src, size); } } - gettimeofday(¤t, NULL); - tmemcpy = get_usec_chunk(last, current); + blosc_set_timestamp(¤t); + tmemcpy = get_usec_chunk(last, current, niter, nchunks); fprintf(ofile, "memcpy(write):\t\t %6.1f us, %.1f MB/s\n", - tmemcpy, size/(tmemcpy*MB/1e6)); + tmemcpy, (size * 1e6) / (tmemcpy*MB)); - gettimeofday(&last, NULL); + blosc_set_timestamp(&last); for (i = 0; i < niter; i++) { for (j = 0; j < nchunks; j++) { memcpy(dest2, dest[j], size); } } - gettimeofday(¤t, NULL); - tmemcpy = get_usec_chunk(last, current); + blosc_set_timestamp(¤t); + tmemcpy = get_usec_chunk(last, current, niter, nchunks); fprintf(ofile, "memcpy(read):\t\t %6.1f us, %.1f MB/s\n", - tmemcpy, size/(tmemcpy*MB/1e6)); + tmemcpy, (size * 1e6) / (tmemcpy*MB)); for (clevel=0; clevel<10; clevel++) { fprintf(ofile, "Compression level: %d\n", clevel); - gettimeofday(&last, NULL); + blosc_set_timestamp(&last); for (i = 0; i < niter; i++) { for (j = 0; j < nchunks; j++) { cbytes = blosc_compress(clevel, doshuffle, elsize, size, src, dest[j], size+BLOSC_MAX_OVERHEAD); } } - gettimeofday(¤t, NULL); - tshuf = get_usec_chunk(last, current); + blosc_set_timestamp(¤t); + tshuf = get_usec_chunk(last, current, niter, nchunks); fprintf(ofile, "comp(write):\t %6.1f us, %.1f MB/s\t ", - tshuf, size/(tshuf*MB/1e6)); + tshuf, (size * 1e6) / (tshuf*MB)); fprintf(ofile, "Final bytes: %d ", 
cbytes); if (cbytes > 0) { fprintf(ofile, "Ratio: %3.2f", size/(float)cbytes); @@ -250,7 +270,7 @@ void do_bench(char *compressor, int nthreads, int size, int elsize, } } - gettimeofday(&last, NULL); + blosc_set_timestamp(&last); for (i = 0; i < niter; i++) { for (j = 0; j < nchunks; j++) { if (cbytes == 0) { @@ -262,26 +282,33 @@ void do_bench(char *compressor, int nthreads, int size, int elsize, } } } - gettimeofday(¤t, NULL); - tunshuf = get_usec_chunk(last, current); + blosc_set_timestamp(¤t); + tunshuf = get_usec_chunk(last, current, niter, nchunks); fprintf(ofile, "decomp(read):\t %6.1f us, %.1f MB/s\t ", - tunshuf, nbytes/(tunshuf*MB/1e6)); + tunshuf, (nbytes * 1e6) / (tunshuf*MB)); if (nbytes < 0) { fprintf(ofile, "FAILED. Error code: %d\n", nbytes); } /* fprintf(ofile, "Orig bytes: %d\tFinal bytes: %d\n", cbytes, nbytes); */ - /* Check if data has had a good roundtrip */ + /* Check if data has had a good roundtrip. + Byte-by-byte comparison is slow, so use 'memcmp' to check whether the + roundtripped data is correct. If not, fall back to the slow path to + print diagnostic messages. 
*/ orig = (unsigned char *)srccpy; round = (unsigned char *)dest2; - for(i = 0; i %x, round-trip--> %x\n", orig[i], round[i]); - break; + if (memcmp(orig, round, size) != 0) + { + for(i = 0; i %x, round-trip--> %x\n", orig[i], round[i]); + break; + } } } + else { i = size; } if (i == size) fprintf(ofile, "OK\n"); @@ -292,9 +319,9 @@ void do_bench(char *compressor, int nthreads, int size, int elsize, compression levels */ totalsize += (size * nchunks * niter * 10.); - free(src); free(srccpy); free(dest2); + aligned_free(src); free(srccpy); aligned_free(dest2); for (i = 0; i < nchunks; i++) { - free(dest[i]); + aligned_free(dest[i]); } } @@ -310,14 +337,6 @@ int get_nchunks(int size_, int ws) { return nchunks; } -void *bench_wrap(void * args) -{ - struct bench_wrap_args * arg = (struct bench_wrap_args *) args; - do_bench(arg->compressor, arg->nthreads, arg->size, arg->elsize, - arg->rshift, arg->output_file); - return 0; -} - void print_compress_info(void) { char *name = NULL, *version = NULL; @@ -343,6 +362,7 @@ void print_compress_info(void) int main(int argc, char *argv[]) { char compressor[32]; + char shuffle[32] = "shuffle"; char bsuite[32]; int single = 1; int suite = 0; @@ -356,15 +376,16 @@ int main(int argc, char *argv[]) { int workingset = 256*MB; /* The maximum allocated memory */ int nthreads_, size_, elsize_, rshift_, i; FILE * output_file = stdout; - struct timeval last, current; + blosc_timestamp_t last, current; float totaltime; char usage[256]; print_compress_info(); strncpy(usage, "Usage: bench [blosclz | lz4 | lz4hc | snappy | zlib] " - "[[single | suite | hardsuite | extremesuite | debugsuite] " - "[nthreads [bufsize(bytes) [typesize [sbits ]]]]]", 255); + "[noshuffle | shuffle | bitshuffle] " + "[single | suite | hardsuite | extremesuite | debugsuite] " + "[nthreads] [bufsize(bytes)] [typesize] [sbits]", 255); if (argc < 2) { printf("%s\n", usage); @@ -379,17 +400,33 @@ int main(int argc, char *argv[]) { strcmp(compressor, "snappy") != 0 && 
strcmp(compressor, "zlib") != 0) { printf("No such compressor: '%s'\n", compressor); + printf("%s\n", usage); exit(2); } - if (argc < 3) + if (argc >= 3) { + strcpy(shuffle, argv[2]); + if (strcmp(shuffle, "shuffle") != 0 && + strcmp(shuffle, "bitshuffle") != 0 && + strcmp(shuffle, "noshuffle") != 0) { + printf("No such shuffler: '%s'\n", shuffle); + printf("%s\n", usage); + exit(2); + } + } + + if (argc < 4) strcpy(bsuite, "single"); else - strcpy(bsuite, argv[2]); + strcpy(bsuite, argv[3]); if (strcmp(bsuite, "single") == 0) { single = 1; } + else if (strcmp(bsuite, "test") == 0) { + single = 1; + workingset = 128*MB; + } else if (strcmp(bsuite, "suite") == 0) { suite = 1; } @@ -429,34 +466,35 @@ int main(int argc, char *argv[]) { } printf("Using compressor: %s\n", compressor); + printf("Using shuffle type: %s\n", shuffle); printf("Running suite: %s\n", bsuite); - if (argc >= 4) { - nthreads = atoi(argv[3]); - } if (argc >= 5) { - size = atoi(argv[4]); + nthreads = atoi(argv[4]); } if (argc >= 6) { - elsize = atoi(argv[5]); + size = atoi(argv[5]); } if (argc >= 7) { - rshift = atoi(argv[6]); + elsize = atoi(argv[6]); + } + if (argc >= 8) { + rshift = atoi(argv[7]); } - if ((argc >= 8) || !(single || suite || hard_suite || extreme_suite)) { + if ((argc >= 9) || !(single || suite || hard_suite || extreme_suite)) { printf("%s\n", usage); exit(1); } nchunks = get_nchunks(size, workingset); - gettimeofday(&last, NULL); + blosc_set_timestamp(&last); blosc_init(); if (suite) { for (nthreads_=1; nthreads_ <= nthreads; nthreads_++) { - do_bench(compressor, nthreads_, size, elsize, rshift, output_file); + do_bench(compressor, shuffle, nthreads_, size, elsize, rshift, output_file); } } else if (hard_suite) { @@ -471,9 +509,9 @@ int main(int argc, char *argv[]) { nchunks = get_nchunks(size_+i, workingset); niter = 1; for (nthreads_ = 1; nthreads_ <= nthreads; nthreads_++) { - do_bench(compressor, nthreads_, size_+i, elsize_, rshift_, output_file); - gettimeofday(¤t, NULL); - 
totaltime = getseconds(last, current); + do_bench(compressor, shuffle, nthreads_, size_+i, elsize_, rshift_, output_file); + blosc_set_timestamp(¤t); + totaltime = (float)getseconds(last, current); printf("Elapsed time:\t %6.1f s. Processed data: %.1f GB\n", totaltime, totalsize / GB); } @@ -490,9 +528,9 @@ int main(int argc, char *argv[]) { for (size_ = 32*KB; size_ <= size; size_ *= 2) { nchunks = get_nchunks(size_+i, workingset); for (nthreads_ = 1; nthreads_ <= nthreads; nthreads_++) { - do_bench(compressor, nthreads_, size_+i, elsize_, rshift_, output_file); - gettimeofday(¤t, NULL); - totaltime = getseconds(last, current); + do_bench(compressor, shuffle, nthreads_, size_+i, elsize_, rshift_, output_file); + blosc_set_timestamp(¤t); + totaltime = (float)getseconds(last, current); printf("Elapsed time:\t %6.1f s. Processed data: %.1f GB\n", totaltime, totalsize / GB); } @@ -509,9 +547,9 @@ int main(int argc, char *argv[]) { for (size_ = size; size_ <= 16*MB; size_ *= 2) { nchunks = get_nchunks(size_+i, workingset); for (nthreads_ = nthreads; nthreads_ <= 6; nthreads_++) { - do_bench(compressor, nthreads_, size_+i, elsize_, rshift_, output_file); - gettimeofday(¤t, NULL); - totaltime = getseconds(last, current); + do_bench(compressor, shuffle, nthreads_, size_+i, elsize_, rshift_, output_file); + blosc_set_timestamp(¤t); + totaltime = (float)getseconds(last, current); printf("Elapsed time:\t %6.1f s. 
Processed data: %.1f GB\n", totaltime, totalsize / GB); } @@ -522,12 +560,12 @@ int main(int argc, char *argv[]) { } /* Single mode */ else { - do_bench(compressor, nthreads, size, elsize, rshift, output_file); + do_bench(compressor, shuffle, nthreads, size, elsize, rshift, output_file); } /* Print out some statistics */ - gettimeofday(¤t, NULL); - totaltime = getseconds(last, current); + blosc_set_timestamp(¤t); + totaltime = (float)getseconds(last, current); printf("\nRound-trip compr/decompr on %.1f GB\n", totalsize / GB); printf("Elapsed time:\t %6.1f s, %.1f MB/s\n", totaltime, totalsize*2*1.1/(MB*totaltime)); diff --git a/c-blosc/bench/plot-speeds.py b/c-blosc/bench/plot-speeds.py index e2624c684..f2dfbbae7 100644 --- a/c-blosc/bench/plot-speeds.py +++ b/c-blosc/bench/plot-speeds.py @@ -26,12 +26,13 @@ def get_values(filename): for line in f: if line.startswith('-->'): tmp = line.split('-->')[1] - nthreads, size, elsize, sbits, codec = [i for i in tmp.split(', ')] + nthreads, size, elsize, sbits, codec, shuffle = [i for i in tmp.split(', ')] nthreads, size, elsize, sbits = map(int, (nthreads, size, elsize, sbits)) values["size"] = size * NCHUNKS / MB_; values["elsize"] = elsize; values["sbits"] = sbits; values["codec"] = codec + values["shuffle"] = shuffle # New run for nthreads (ratios, speedsw, speedsr) = ([], [], []) # Add a new entry for (ratios, speedw, speedr) @@ -178,7 +179,7 @@ def show_plot(plots, yaxis, legends, gtitle, xmax=None): if options.title: plot_title = options.title else: - plot_title += " (%(size).1f MB, %(elsize)d bytes, %(sbits)d bits), %(codec)s" % values + plot_title += " (%(size).1f MB, %(elsize)d bytes, %(sbits)d bits), %(codec)s %(shuffle)s" % values gtitle = plot_title diff --git a/c-blosc/blosc/CMakeLists.txt b/c-blosc/blosc/CMakeLists.txt index dbdfb7357..b6e89c7dd 100644 --- a/c-blosc/blosc/CMakeLists.txt +++ b/c-blosc/blosc/CMakeLists.txt @@ -3,12 +3,17 @@ add_definitions(-DUSING_CMAKE) set(INTERNAL_LIBS 
${CMAKE_SOURCE_DIR}/internal-complibs) +# Hide symbols by default unless they're specifically exported. +# This makes it easier to keep the set of exported symbols the +# same across all compilers/platforms. +set(CMAKE_C_VISIBILITY_PRESET hidden) + # includes if(NOT DEACTIVATE_LZ4) if (LZ4_FOUND) include_directories( ${LZ4_INCLUDE_DIR} ) else(LZ4_FOUND) - set(LZ4_LOCAL_DIR ${INTERNAL_LIBS}/lz4-r119) + set(LZ4_LOCAL_DIR ${INTERNAL_LIBS}/lz4-1.7.2) include_directories( ${LZ4_LOCAL_DIR} ) endif(LZ4_FOUND) endif(NOT DEACTIVATE_LZ4) @@ -32,7 +37,17 @@ if(NOT DEACTIVATE_ZLIB) endif(NOT DEACTIVATE_ZLIB) # library sources -set(SOURCES blosc.c blosclz.c shuffle.c) +set(SOURCES blosc.c blosclz.c shuffle-generic.c bitshuffle-generic.c) +if(COMPILER_SUPPORT_SSE2) + message(STATUS "Adding run-time support for SSE2") + set(SOURCES ${SOURCES} shuffle-sse2.c bitshuffle-sse2.c) +endif(COMPILER_SUPPORT_SSE2) +if(COMPILER_SUPPORT_AVX2) + message(STATUS "Adding run-time support for AVX2") + set(SOURCES ${SOURCES} shuffle-avx2.c bitshuffle-avx2.c) +endif(COMPILER_SUPPORT_AVX2) +set(SOURCES ${SOURCES} shuffle.c) + # library install directory set(lib_dir lib${LIB_SUFFIX}) set(version_string ${BLOSC_VERSION_MAJOR}.${BLOSC_VERSION_MINOR}.${BLOSC_VERSION_PATCH}) @@ -79,7 +94,6 @@ if(NOT DEACTIVATE_ZLIB) endif(ZLIB_FOUND) endif(NOT DEACTIVATE_ZLIB) - # targets add_library(blosc_shared SHARED ${SOURCES}) set_target_properties(blosc_shared PROPERTIES OUTPUT_NAME blosc) @@ -87,17 +101,83 @@ set_target_properties(blosc_shared PROPERTIES VERSION ${version_string} SOVERSION 1 # Change this when an ABI change happens ) +set_property( + TARGET blosc_shared + APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_SHARED_LIBRARY) + +# Based on the target architecture and hardware features supported +# by the C compiler, set hardware architecture optimization flags +# for specific shuffle implementations. 
+if(COMPILER_SUPPORT_SSE2) + if (MSVC) + # MSVC targets SSE2 by default on 64-bit configurations, but not 32-bit configurations. + if (${CMAKE_SIZEOF_VOID_P} EQUAL 4) + set_source_files_properties(shuffle-sse2.c bitshuffle-sse2.c PROPERTIES COMPILE_FLAGS "/arch:SSE2") + endif (${CMAKE_SIZEOF_VOID_P} EQUAL 4) + else (MSVC) + set_source_files_properties(shuffle-sse2.c bitshuffle-sse2.c PROPERTIES COMPILE_FLAGS -msse2) + endif (MSVC) + + # Define a symbol for the shuffle-dispatch implementation + # so it knows SSE2 is supported even though that file is + # compiled without SSE2 support (for portability). + set_property( + SOURCE shuffle.c + APPEND PROPERTY COMPILE_DEFINITIONS SHUFFLE_SSE2_ENABLED) +endif(COMPILER_SUPPORT_SSE2) +if(COMPILER_SUPPORT_AVX2) + if (MSVC) + set_source_files_properties(shuffle-avx2.c bitshuffle-avx2.c PROPERTIES COMPILE_FLAGS "/arch:AVX2") + else (MSVC) + set_source_files_properties(shuffle-avx2.c bitshuffle-avx2.c PROPERTIES COMPILE_FLAGS -mavx2) + endif (MSVC) + + # Define a symbol for the shuffle-dispatch implementation + # so it knows AVX2 is supported even though that file is + # compiled without AVX2 support (for portability). + set_property( + SOURCE shuffle.c + APPEND PROPERTY COMPILE_DEFINITIONS SHUFFLE_AVX2_ENABLED) +endif(COMPILER_SUPPORT_AVX2) + +# When the option has been selected to compile the test suite, +# compile an additional version of blosc_shared which exports +# some normally-hidden symbols (to facilitate unit testing). +if (BUILD_TESTS) + add_library(blosc_shared_testing SHARED ${SOURCES}) + set_target_properties(blosc_shared_testing PROPERTIES OUTPUT_NAME blosc_testing) + set_property( + TARGET blosc_shared_testing + APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_SHARED_LIBRARY) + set_property( + TARGET blosc_shared_testing + APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_TESTING) + # TEMP : CMake doesn't automatically add -lpthread here like it does + # for the blosc_shared target. Force it for now. 
+ if(UNIX) + set_property( + TARGET blosc_shared_testing + APPEND PROPERTY LINK_FLAGS "-lpthread") + endif() +endif() + target_link_libraries(blosc_shared ${LIBS}) +if (BUILD_TESTS) + target_link_libraries(blosc_shared_testing ${LIBS}) +endif() if(BUILD_STATIC) add_library(blosc_static STATIC ${SOURCES}) set_target_properties(blosc_static PROPERTIES OUTPUT_NAME blosc) + if (MSVC) + set_target_properties(blosc_static PROPERTIES PREFIX lib) + endif() target_link_libraries(blosc_static ${LIBS}) endif(BUILD_STATIC) # install -install(FILES blosc.h DESTINATION include COMPONENT DEV) +install(FILES blosc.h blosc-export.h DESTINATION include COMPONENT DEV) install(TARGETS blosc_shared DESTINATION ${lib_dir} COMPONENT LIB) if(BUILD_STATIC) install(TARGETS blosc_static DESTINATION ${lib_dir} COMPONENT DEV) diff --git a/c-blosc/blosc/bitshuffle-avx2.c b/c-blosc/blosc/bitshuffle-avx2.c new file mode 100644 index 000000000..91db8f7c0 --- /dev/null +++ b/c-blosc/blosc/bitshuffle-avx2.c @@ -0,0 +1,248 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * Note: Adapted for c-blosc by Francesc Alted. + * + * See LICENSES/BITSHUFFLE.txt file for details about copyright and + * rights to use. + * + */ + +#include "bitshuffle-generic.h" +#include "bitshuffle-sse2.h" +#include "bitshuffle-avx2.h" + + +/* Make sure AVX2 is available for the compilation target and compiler. */ +#if !defined(__AVX2__) + #error AVX2 is not supported by the target architecture/platform and/or this compiler. 
+#endif + +#include + +/* The next is useful for debugging purposes */ +#if 0 +#include +#include + +static void printymm(__m256i ymm0) +{ + uint8_t buf[32]; + + ((__m256i *)buf)[0] = ymm0; + printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", + buf[0], buf[1], buf[2], buf[3], + buf[4], buf[5], buf[6], buf[7], + buf[8], buf[9], buf[10], buf[11], + buf[12], buf[13], buf[14], buf[15], + buf[16], buf[17], buf[18], buf[19], + buf[20], buf[21], buf[22], buf[23], + buf[24], buf[25], buf[26], buf[27], + buf[28], buf[29], buf[30], buf[31]); +} +#endif + + +/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */ + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_avx2(void* in, void* out, const size_t size, + const size_t elem_size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + int32_t* out_i32; + + size_t nbyte = elem_size * size; + + int64_t count; + + __m256i ymm; + int32_t bt; + size_t ii, kk; + + for (ii = 0; ii + 31 < nbyte; ii += 32) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_i32 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 32); + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + count = bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf); + CHECK_ERR(count); + count = bshuf_trans_bit_byte_avx2(out, tmp_buf, size, elem_size); + CHECK_ERR(count); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. 
*/ +int64_t bshuf_trans_byte_bitrow_avx2(void* in, void* out, const size_t size, + const size_t elem_size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + size_t ii, jj, kk, hh, mm; + + CHECK_MULT_EIGHT(size); + + if (elem_size % 4) + return bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size); + + __m256i ymm_0[8]; + __m256i ymm_1[8]; + __m256i ymm_storeage[8][4]; + + for (jj = 0; jj + 31 < nbyte_row; jj += 32) { + for (ii = 0; ii + 3 < elem_size; ii += 4) { + for (hh = 0; hh < 4; hh ++) { + + for (kk = 0; kk < 8; kk ++){ + ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ + (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); + } + + for (kk = 0; kk < 4; kk ++){ + ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + } + + for (kk = 0; kk < 2; kk ++){ + for (mm = 0; mm < 2; mm ++){ + ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( + ymm_1[kk * 4 + mm * 2], + ymm_1[kk * 4 + mm * 2 + 1]); + ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( + ymm_1[kk * 4 + mm * 2], + ymm_1[kk * 4 + mm * 2 + 1]); + } + } + + for (kk = 0; kk < 4; kk ++){ + ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + } + + for (kk = 0; kk < 8; kk ++){ + ymm_storeage[kk][hh] = ymm_1[kk]; + } + } + + for (mm = 0; mm < 8; mm ++) { + + for (kk = 0; kk < 4; kk ++){ + ymm_0[kk] = ymm_storeage[mm][kk]; + } + + ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); + ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); + ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); + ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); + + ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); + ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); + ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); + ymm_0[3] = 
_mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); + + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); + } + } + } + for (ii = 0; ii < nrows; ii ++ ) { + for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_avx2(void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + /* With a bit of care, this could be written such that such that it is */ + /* in_buf = out_buf safe. */ + char* in_b = (char*) in; + char* out_b = (char*) out; + + size_t nbyte = elem_size * size; + size_t ii, jj, kk, ind; + + __m256i ymm; + int32_t bt; + + if (elem_size % 4) { + return bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size); + } else { + for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + ind = (ii + jj / 8 + (7 - kk) * elem_size); + * (int32_t *) &out_b[ind] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. 
*/ +int64_t bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + count = bshuf_trans_byte_bitrow_avx2(in, tmp_buf, size, elem_size); + CHECK_ERR(count); + count = bshuf_shuffle_bit_eightelem_avx2(tmp_buf, out, size, elem_size); + + return count; +} diff --git a/c-blosc/blosc/bitshuffle-avx2.h b/c-blosc/blosc/bitshuffle-avx2.h new file mode 100644 index 000000000..4bdd98945 --- /dev/null +++ b/c-blosc/blosc/bitshuffle-avx2.h @@ -0,0 +1,38 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +/* AVX2-accelerated shuffle/unshuffle routines. */ + +#ifndef BITSHUFFLE_AVX2_H +#define BITSHUFFLE_AVX2_H + +#include "shuffle-common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + AVX2-accelerated bitshuffle routine. +*/ +BLOSC_NO_EXPORT int64_t +bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + +/** + AVX2-accelerated bitunshuffle routine. +*/ +BLOSC_NO_EXPORT int64_t +bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + +#ifdef __cplusplus +} +#endif + +#endif /* BITSHUFFLE_AVX2_H */ diff --git a/c-blosc/blosc/bitshuffle-generic.c b/c-blosc/blosc/bitshuffle-generic.c new file mode 100644 index 000000000..589803f60 --- /dev/null +++ b/c-blosc/blosc/bitshuffle-generic.c @@ -0,0 +1,197 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. 
+**********************************************************************/ + +#include "bitshuffle-generic.h" + + +/* Transpose bytes within elements, starting partway through input. */ +int64_t bshuf_trans_byte_elem_remainder(void* in, void* out, const size_t size, + const size_t elem_size, const size_t start) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + size_t ii, jj, kk; + + CHECK_MULT_EIGHT(start); + + if (size > start) { + /* ii loop separated into 2 loops so the compiler can unroll */ + /* the inner one. */ + for (ii = start; ii + 7 < size; ii += 8) { + for (jj = 0; jj < elem_size; jj++) { + for (kk = 0; kk < 8; kk++) { + out_b[jj * size + ii + kk] + = in_b[ii * elem_size + kk * elem_size + jj]; + } + } + } + for (ii = size - size % 8; ii < size; ii ++) { + for (jj = 0; jj < elem_size; jj++) { + out_b[jj * size + ii] = in_b[ii * elem_size + jj]; + } + } + } + return size * elem_size; +} + + +/* Transpose bytes within elements. */ +int64_t bshuf_trans_byte_elem_scal(void* in, void* out, const size_t size, + const size_t elem_size) { + + return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0); +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_remainder(void* in, void* out, const size_t size, + const size_t elem_size, const size_t start_byte) { + + int64_t* in_b = in; + int8_t* out_b = out; + + int64_t x, t; + + size_t nbyte = elem_size * size; + size_t nbyte_bitrow = nbyte / 8; + size_t ii; + int kk; + + CHECK_MULT_EIGHT(nbyte); + CHECK_MULT_EIGHT(start_byte); + + for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) { + x = in_b[ii]; + TRANS_BIT_8X8(x, t); + for (kk = 0; kk < 8; kk ++) { + out_b[kk * nbyte_bitrow + ii] = x; + x = x >> 8; + } + } + return size * elem_size; +} + + +/* Transpose bits within bytes. 
*/ +int64_t bshuf_trans_bit_byte_scal(void* in, void* out, const size_t size, + const size_t elem_size) { + + return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0); +} + + +/* General transpose of an array, optimized for large element sizes. */ +int64_t bshuf_trans_elem(void* in, void* out, const size_t lda, + const size_t ldb, const size_t elem_size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + size_t ii, jj; + for (ii = 0; ii < lda; ii++) { + for (jj = 0; jj < ldb; jj++) { + memcpy(&out_b[(jj*lda + ii) * elem_size], + &in_b[(ii*ldb + jj) * elem_size], elem_size); + } + } + return lda * ldb * elem_size; +} + + +/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */ +int64_t bshuf_trans_bitrow_eight(void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t nbyte_bitrow = size / 8; + + CHECK_MULT_EIGHT(size); + + return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow); +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + CHECK_ERR(count); + count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size); + CHECK_ERR(count); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. 
*/ +int64_t bshuf_trans_byte_bitrow_scal(void* in, void* out, const size_t size, + const size_t elem_size) { + char* in_b = (char*) in; + char* out_b = (char*) out; + + size_t nbyte_row = size / 8; + size_t ii, jj, kk; + + CHECK_MULT_EIGHT(size); + + for (jj = 0; jj < elem_size; jj++) { + for (ii = 0; ii < nbyte_row; ii++) { + for (kk = 0; kk < 8; kk++) { + out_b[ii * 8 * elem_size + jj * 8 + kk] = \ + in_b[(jj * 8 + kk) * nbyte_row + ii]; + } + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_scal(void* in, void* out, + const size_t size, const size_t elem_size) { + char* in_b = (char*) in; + char* out_b = (char*) out; + size_t nbyte = elem_size * size; + int64_t x, t; + size_t jj, ii, kk; + + CHECK_MULT_EIGHT(size); + + for (jj = 0; jj < 8 * elem_size; jj += 8) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { + x = *((int64_t*) &in_b[ii + jj]); + TRANS_BIT_8X8(x, t); + for (kk = 0; kk < 8; kk++) { + *((uint8_t*) &out_b[ii + jj / 8 + kk * elem_size]) = x; + x = x >> 8; + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_scal(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size); + CHECK_ERR(count); + count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size); + + return count; +} diff --git a/c-blosc/blosc/bitshuffle-generic.h b/c-blosc/blosc/bitshuffle-generic.h new file mode 100644 index 000000000..03b3f5547 --- /dev/null +++ b/c-blosc/blosc/bitshuffle-generic.h @@ -0,0 +1,151 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. 
+**********************************************************************/ + +/* Generic (non-hardware-accelerated) shuffle/unshuffle routines. + These are used when hardware-accelerated functions aren't available + for a particular platform; they are also used by the hardware- + accelerated functions to handle any remaining elements in a block + which isn't a multiple of the hardware's vector size. */ + +#ifndef BITSHUFFLE_GENERIC_H +#define BITSHUFFLE_GENERIC_H + +#include "shuffle-common.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Macros. */ +#define CHECK_MULT_EIGHT(n) if (n % 8) return -80; +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) +#define CHECK_ERR(count) if (count < 0) { return count; } + + +/* ---- Worker code not requiring special instruction sets. ---- + * + * The following code does not use any x86 specific vectorized instructions + * and should compile on any machine + * + */ + +/* Transpose 8x8 bit array packed into a single quadword *x*. + * *t* is workspace. */ +#define TRANS_BIT_8X8(x, t) { \ + t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL; \ + x = x ^ t ^ (t << 7); \ + t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL; \ + x = x ^ t ^ (t << 14); \ + t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL; \ + x = x ^ t ^ (t << 28); \ + } + + +/* Transpose of an array of arbitrarily typed elements. 
*/ +#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) { \ + type_t* in_type = (type_t*) in; \ + type_t* out_type = (type_t*) out; \ + size_t ii, jj, kk; \ + for (ii = 0; ii + 7 < lda; ii += 8) { \ + for (jj = 0; jj < ldb; jj++) { \ + for (kk = 0; kk < 8; kk++) { \ + out_type[jj*lda + ii + kk] = \ + in_type[ii*ldb + kk * ldb + jj]; \ + } \ + } \ + } \ + for (ii = lda - lda % 8; ii < lda; ii ++) { \ + for (jj = 0; jj < ldb; jj++) { \ + out_type[jj*lda + ii] = in_type[ii*ldb + jj]; \ + } \ + } \ + } + + +/* Private functions */ +BLOSC_NO_EXPORT int64_t +bshuf_trans_byte_elem_remainder(void* in, void* out, const size_t size, + const size_t elem_size, const size_t start); + +BLOSC_NO_EXPORT int64_t +bshuf_trans_byte_elem_scal(void* in, void* out, const size_t size, + const size_t elem_size); + +BLOSC_NO_EXPORT int64_t +bshuf_trans_bit_byte_remainder(void* in, void* out, const size_t size, + const size_t elem_size, const size_t start_byte); + +BLOSC_NO_EXPORT int64_t +bshuf_trans_elem(void* in, void* out, const size_t lda, + const size_t ldb, const size_t elem_size); + +BLOSC_NO_EXPORT int64_t +bshuf_trans_bitrow_eight(void* in, void* out, const size_t size, + const size_t elem_size); + +BLOSC_NO_EXPORT int64_t +bshuf_shuffle_bit_eightelem_scal(void* in, void* out, + const size_t size, const size_t elem_size); + + +/* Bitshuffle the data. + * + * Transpose the bits within elements. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * tmp_buffer : temporary buffer with the same `size` than `in` and `out` + * + * Returns + * ------- + * nothing -- this cannot fail + * + */ + +BLOSC_NO_EXPORT int64_t +bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + +/* Unshuffle bitshuffled data. + * + * Untranspose the bits within elements. 
+ * + * To properly unshuffle bitshuffled data, *size* and *elem_size* must + * match the parameters used to shuffle the data. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * tmp_buffer : temporary buffer with the same `size` than `in` and `out` + * + * Returns + * ------- + * nothing -- this cannot fail + * + */ + +BLOSC_NO_EXPORT int64_t +bshuf_untrans_bit_elem_scal(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + + +#ifdef __cplusplus +} +#endif + +#endif /* BITSHUFFLE_GENERIC_H */ diff --git a/c-blosc/blosc/bitshuffle-sse2.c b/c-blosc/blosc/bitshuffle-sse2.c new file mode 100644 index 000000000..8191ca2b5 --- /dev/null +++ b/c-blosc/blosc/bitshuffle-sse2.c @@ -0,0 +1,467 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * Note: Adapted for c-blosc by Francesc Alted. + * + * See LICENSES/BITSHUFFLE.txt file for details about copyright and + * rights to use. + * + */ + +#include "bitshuffle-generic.h" +#include "bitshuffle-sse2.h" + +/* Make sure SSE2 is available for the compilation target and compiler. */ +#if !defined(__SSE2__) + #error SSE2 is not supported by the target architecture/platform and/or this compiler. +#endif + +#include + +/* The next is useful for debugging purposes */ +#if 0 +#include +#include + + +static void printxmm(__m128i xmm0) +{ + uint8_t buf[32]; + + ((__m128i *)buf)[0] = xmm0; + printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", + buf[0], buf[1], buf[2], buf[3], + buf[4], buf[5], buf[6], buf[7], + buf[8], buf[9], buf[10], buf[11], + buf[12], buf[13], buf[14], buf[15]); +} +#endif + + +/* ---- Worker code that requires SSE2. Intel Petium 4 (2000) and later. 
---- */ + +/* Transpose bytes within elements for 16 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_16(void* in, void* out, const size_t size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + __m128i a0, b0, a1, b1; + size_t ii; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 2, + size - size % 16); +} + + +/* Transpose bytes within elements for 32 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + __m128i a0, b0, c0, d0, a1, b1, c1, d1; + size_t ii; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]); + c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]); + d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + c0 = _mm_unpacklo_epi8(c1, d1); + d0 = _mm_unpackhi_epi8(c1, d1); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + + a0 = _mm_unpacklo_epi64(a1, c1); + b0 = _mm_unpackhi_epi64(a1, c1); + c0 = _mm_unpacklo_epi64(b1, d1); + d0 = _mm_unpackhi_epi64(b1, d1); + + _mm_storeu_si128((__m128i *) 
&out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); + _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 4, + size - size % 16); +} + + +/* Transpose bytes within elements for 64 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + __m128i a0, b0, c0, d0, e0, f0, g0, h0; + __m128i a1, b1, c1, d1, e1, f1, g1, h1; + size_t ii; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); + c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); + d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); + e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); + f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); + g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); + h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + e1 = _mm_unpacklo_epi8(e0, f0); + f1 = _mm_unpackhi_epi8(e0, f0); + g1 = _mm_unpacklo_epi8(g0, h0); + h1 = _mm_unpackhi_epi8(g0, h0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + c0 = _mm_unpacklo_epi8(c1, d1); + d0 = _mm_unpackhi_epi8(c1, d1); + e0 = _mm_unpacklo_epi8(e1, f1); + f0 = _mm_unpackhi_epi8(e1, f1); + g0 = _mm_unpacklo_epi8(g1, h1); + h0 = _mm_unpackhi_epi8(g1, h1); + + a1 = _mm_unpacklo_epi32(a0, c0); + b1 = _mm_unpackhi_epi32(a0, c0); + c1 = _mm_unpacklo_epi32(b0, d0); + d1 = _mm_unpackhi_epi32(b0, d0); + e1 = _mm_unpacklo_epi32(e0, g0); + f1 = _mm_unpackhi_epi32(e0, g0); + g1 = _mm_unpacklo_epi32(f0, h0); + h1 = _mm_unpackhi_epi32(f0, h0); + + a0 = _mm_unpacklo_epi64(a1, e1); + b0 = _mm_unpackhi_epi64(a1, e1); + c0 = 
_mm_unpacklo_epi64(b1, f1); + d0 = _mm_unpackhi_epi64(b1, f1); + e0 = _mm_unpacklo_epi64(c1, g1); + f0 = _mm_unpackhi_epi64(c1, g1); + g0 = _mm_unpacklo_epi64(d1, h1); + h0 = _mm_unpackhi_epi64(d1, h1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); + _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); + _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0); + _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); + _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); + _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 8, + size - size % 16); +} + + +/* Memory copy with bshuf call signature. */ +int64_t bshuf_copy(void* in, void* out, const size_t size, + const size_t elem_size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + + memcpy(out_b, in_b, size * elem_size); + return size * elem_size; +} + + +/* Transpose bytes within elements using best SSE algorithm available. */ +int64_t bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + /* Trivial cases: power of 2 bytes. */ + switch (elem_size) { + case 1: + count = bshuf_copy(in, out, size, elem_size); + return count; + case 2: + count = bshuf_trans_byte_elem_SSE_16(in, out, size); + return count; + case 4: + count = bshuf_trans_byte_elem_SSE_32(in, out, size); + return count; + case 8: + count = bshuf_trans_byte_elem_SSE_64(in, out, size); + return count; + } + + /* Worst case: odd number of bytes. Turns out that this is faster for */ + /* (odd * 2) byte elements as well (hence % 4). */ + if (elem_size % 4) { + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + return count; + } + + /* Multiple of power of 2: transpose hierarchically. 
*/ + { + size_t nchunk_elem; + + if ((elem_size % 8) == 0) { + nchunk_elem = elem_size / 8; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); + count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); + } else if ((elem_size % 4) == 0) { + nchunk_elem = elem_size / 4; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); + count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); + } else { + /* Not used since scalar algorithm is faster. */ + nchunk_elem = elem_size / 2; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); + count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); + } + + return count; + } +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size, + const size_t elem_size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + uint16_t* out_ui16; + int64_t count; + size_t nbyte = elem_size * size; + __m128i xmm; + int32_t bt; + size_t ii, kk; + + CHECK_MULT_EIGHT(nbyte); + + for (ii = 0; ii + 15 < nbyte; ii += 16) { + xmm = _mm_loadu_si128((__m128i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); + out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_ui16 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 16); + return count; +} + + +/* Transpose bits within elements. 
*/ +int64_t bshuf_trans_bit_elem_sse2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + count = bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf); + CHECK_ERR(count); + count = bshuf_trans_bit_byte_sse2(out, tmp_buf, size, elem_size); + CHECK_ERR(count); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. */ +int64_t bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size, + const size_t elem_size) { + + char* in_b = (char*) in; + char* out_b = (char*) out; + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + size_t ii, jj; + + __m128i a0, b0, c0, d0, e0, f0, g0, h0; + __m128i a1, b1, c1, d1, e1, f1, g1, h1; + __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; + + CHECK_MULT_EIGHT(size); + + for (ii = 0; ii + 7 < nrows; ii += 8) { + for (jj = 0; jj + 15 < nbyte_row; jj += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]); + b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]); + c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]); + d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]); + e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]); + f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]); + g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]); + h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]); + + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpacklo_epi8(c0, d0); + c1 = _mm_unpacklo_epi8(e0, f0); + d1 = _mm_unpacklo_epi8(g0, h0); + e1 = _mm_unpackhi_epi8(a0, b0); + f1 = _mm_unpackhi_epi8(c0, d0); + g1 = _mm_unpackhi_epi8(e0, f0); + h1 = _mm_unpackhi_epi8(g0, h0); + + + a0 = _mm_unpacklo_epi16(a1, b1); + b0 = _mm_unpacklo_epi16(c1, d1); + c0 = _mm_unpackhi_epi16(a1, b1); + d0 = _mm_unpackhi_epi16(c1, 
d1); + + e0 = _mm_unpacklo_epi16(e1, f1); + f0 = _mm_unpacklo_epi16(g1, h1); + g0 = _mm_unpackhi_epi16(e1, f1); + h0 = _mm_unpackhi_epi16(g1, h1); + + + a1 = _mm_unpacklo_epi32(a0, b0); + b1 = _mm_unpackhi_epi32(a0, b0); + + c1 = _mm_unpacklo_epi32(c0, d0); + d1 = _mm_unpackhi_epi32(c0, d0); + + e1 = _mm_unpacklo_epi32(e0, f0); + f1 = _mm_unpackhi_epi32(e0, f0); + + g1 = _mm_unpacklo_epi32(g0, h0); + h1 = _mm_unpackhi_epi32(g0, h0); + + /* We don't have a storeh instruction for integers, so interpret */ + /* as a float. Have a storel (_mm_storel_epi64). */ + as = (__m128 *) &a1; + bs = (__m128 *) &b1; + cs = (__m128 *) &c1; + ds = (__m128 *) &d1; + es = (__m128 *) &e1; + fs = (__m128 *) &f1; + gs = (__m128 *) &g1; + hs = (__m128 *) &h1; + + _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as); + _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs); + _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs); + _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds); + _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es); + _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs); + _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs); + _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs); + + _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as); + _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds); + _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es); + _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs); + } + for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; + out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; + out_b[jj * nrows + ii + 2] = in_b[(ii + 
2)*nbyte_row + jj]; + out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; + out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; + out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; + out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; + out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size, + const size_t elem_size) { + /* With a bit of care, this could be written such that such that it is */ + /* in_buf = out_buf safe. */ + char* in_b = (char*) in; + uint16_t* out_ui16 = (uint16_t*) out; + + size_t nbyte = elem_size * size; + + __m128i xmm; + int32_t bt; + size_t ii, jj, kk; + size_t ind; + + CHECK_MULT_EIGHT(size); + + if (elem_size % 2) { + bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); + } else { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { + xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); + ind = (ii + jj / 8 + (7 - kk) * elem_size); + out_ui16[ind / 2] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. 
*/ +int64_t bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + count = bshuf_trans_byte_bitrow_sse2(in, tmp_buf, size, elem_size); + CHECK_ERR(count); + count = bshuf_shuffle_bit_eightelem_sse2(tmp_buf, out, size, elem_size); + + return count; +} diff --git a/c-blosc/blosc/bitshuffle-sse2.h b/c-blosc/blosc/bitshuffle-sse2.h new file mode 100644 index 000000000..703728141 --- /dev/null +++ b/c-blosc/blosc/bitshuffle-sse2.h @@ -0,0 +1,52 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +/* SSE2-accelerated shuffle/unshuffle routines. */ + +#ifndef BITSHUFFLE_SSE2_H +#define BITSHUFFLE_SSE2_H + +#include "shuffle-common.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +BLOSC_NO_EXPORT int64_t +bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + +BLOSC_NO_EXPORT int64_t +bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size, + const size_t elem_size); + +BLOSC_NO_EXPORT int64_t +bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size, + const size_t elem_size); + +/** + SSE2-accelerated bitshuffle routine. +*/ +BLOSC_NO_EXPORT int64_t +bshuf_trans_bit_elem_sse2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + +/** + SSE2-accelerated bitunshuffle routine. 
+*/ +BLOSC_NO_EXPORT int64_t +bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size, + const size_t elem_size, void* tmp_buf); + +#ifdef __cplusplus +} +#endif + + +#endif /* BITSHUFFLE_SSE2_H */ diff --git a/c-blosc/blosc/blosc-export.h b/c-blosc/blosc/blosc-export.h new file mode 100644 index 000000000..49df9296b --- /dev/null +++ b/c-blosc/blosc/blosc-export.h @@ -0,0 +1,45 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ +#ifndef BLOSC_EXPORT_H +#define BLOSC_EXPORT_H + +/* Macros for specifying exported symbols. + BLOSC_EXPORT is used to decorate symbols that should be + exported by the blosc shared library. + BLOSC_NO_EXPORT is used to decorate symbols that should NOT + be exported by the blosc shared library. +*/ +#if defined(BLOSC_SHARED_LIBRARY) + #if defined(_MSC_VER) + #define BLOSC_EXPORT __declspec(dllexport) + #elif (defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__) + #if defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) + #define BLOSC_EXPORT __attribute__((dllexport)) + #else + #define BLOSC_EXPORT __attribute__((visibility("default"))) + #endif /* defined(_WIN32) || defined(__CYGWIN__) */ + #else + #error Cannot determine how to define BLOSC_EXPORT for this compiler. + #endif +#else + #define BLOSC_EXPORT +#endif /* defined(BLOSC_SHARED_LIBRARY) */ + +#if defined(__GNUC__) || defined(__clang__) + #define BLOSC_NO_EXPORT __attribute__((visibility("hidden"))) +#else + #define BLOSC_NO_EXPORT +#endif /* defined(__GNUC__) || defined(__clang__) */ + +/* When testing, export everything to make it easier to implement tests. 
*/ +#if defined(BLOSC_TESTING) + #undef BLOSC_NO_EXPORT + #define BLOSC_NO_EXPORT BLOSC_EXPORT +#endif /* defined(BLOSC_TESTING) */ + +#endif /* BLOSC_EXPORT_H */ diff --git a/c-blosc/blosc/blosc.c b/c-blosc/blosc/blosc.c index f19905f9c..9e1b9645e 100644 --- a/c-blosc/blosc/blosc.c +++ b/c-blosc/blosc/blosc.c @@ -1,14 +1,13 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library - Author: Francesc Alted + Author: Francesc Alted Creation date: 2009-05-20 See LICENSES/BLOSC.txt for details about copyright and rights to use. **********************************************************************/ -#include #include #include #include @@ -33,7 +32,15 @@ #if defined(_WIN32) && !defined(__MINGW32__) #include - #include "win32/stdint-windows.h" + #include + + /* stdint.h only available in VS2010 (VC++ 16.0) and newer */ + #if defined(_MSC_VER) && _MSC_VER < 1600 + #include "win32/stdint-windows.h" + #else + #include + #endif + #include #define getpid _getpid #else @@ -42,13 +49,18 @@ #include #endif /* _WIN32 */ -#if defined(_WIN32) +#if defined(_WIN32) && !defined(__GNUC__) #include "win32/pthread.h" #include "win32/pthread.c" #else #include #endif +/* If C11 is supported, use it's built-in aligned allocation. */ +#if __STDC_VERSION__ >= 201112L + #include +#endif + /* Some useful units */ #define KB 1024 @@ -63,120 +75,123 @@ /* The size of L1 cache. 32 KB is quite common nowadays. */ #define L1 (32*KB) -/* Wrapped function to adjust the number of threads used by blosc */ -int blosc_set_nthreads_(int); - -/* Global variables for main logic */ -static int32_t init_temps_done = 0; /* temp for compr/decompr initialized? */ -static int32_t force_blocksize = 0; /* force the use of a blocksize? */ -static int pid = 0; /* the PID for this process */ -static int init_lib = 0; /* is library initalized? 
*/ - -/* Global variables for threads */ -static int32_t nthreads = 1; /* number of desired threads in pool */ -static int32_t compressor = BLOSC_BLOSCLZ; /* the compressor to use by default */ -static int32_t init_threads_done = 0; /* pool of threads initialized? */ -static int32_t end_threads = 0; /* should exisiting threads end? */ -static int32_t init_sentinels_done = 0; /* sentinels initialized? */ -static int32_t giveup_code; /* error code when give up */ -static int32_t nblock; /* block counter */ -static pthread_t threads[BLOSC_MAX_THREADS]; /* opaque structure for threads */ -static int32_t tids[BLOSC_MAX_THREADS]; /* ID per each thread */ -#if !defined(_WIN32) -static pthread_attr_t ct_attr; /* creation time attrs for threads */ -#endif - /* Have problems using posix barriers when symbol value is 200112L */ /* This requires more investigation, but will work for the moment */ #if defined(_POSIX_BARRIERS) && ( (_POSIX_BARRIERS - 20012L) >= 0 && _POSIX_BARRIERS != 200112L) #define _POSIX_BARRIERS_MINE #endif - /* Synchronization variables */ -static pthread_mutex_t count_mutex; -static pthread_mutex_t global_comp_mutex; -#ifdef _POSIX_BARRIERS_MINE -static pthread_barrier_t barr_init; -static pthread_barrier_t barr_finish; -#else -static int32_t count_threads; -static pthread_mutex_t count_threads_mutex; -static pthread_cond_t count_threads_cv; -#endif -/* Structure for parameters in (de-)compression threads */ -static struct thread_data { - int32_t typesize; - int32_t blocksize; - int32_t compress; - int32_t clevel; - int32_t flags; - int32_t memcpyed; - int32_t ntbytes; - int32_t nbytes; - int32_t maxbytes; - int32_t nblocks; - int32_t leftover; - uint8_t *bstarts; /* start pointers for each block */ - uint8_t *src; - uint8_t *dest; - uint8_t *tmp[BLOSC_MAX_THREADS]; - uint8_t *tmp2[BLOSC_MAX_THREADS]; -} params; +struct blosc_context { + int32_t compress; /* 1 if we are doing compression 0 if decompress */ + + const uint8_t* src; + uint8_t* dest; /* The 
current pos in the destination buffer */ + uint8_t* header_flags; /* Flags for header. Currently booked: + - 0: byte-shuffled? + - 1: memcpy'ed? + - 2: bit-shuffled? */ + int32_t sourcesize; /* Number of bytes in source buffer (or uncompressed bytes in compressed file) */ + int32_t nblocks; /* Number of total blocks in buffer */ + int32_t leftover; /* Extra bytes at end of buffer */ + int32_t blocksize; /* Length of the block in bytes */ + int32_t typesize; /* Type size */ + int32_t num_output_bytes; /* Counter for the number of output bytes */ + int32_t destsize; /* Maximum size for destination buffer */ + uint8_t* bstarts; /* Start of the buffer past header info */ + int32_t compcode; /* Compressor code to use */ + int clevel; /* Compression level (1-9) */ + + /* Threading */ + int32_t numthreads; + int32_t threads_started; + int32_t end_threads; + pthread_t threads[BLOSC_MAX_THREADS]; + int32_t tids[BLOSC_MAX_THREADS]; + pthread_mutex_t count_mutex; + #ifdef _POSIX_BARRIERS_MINE + pthread_barrier_t barr_init; + pthread_barrier_t barr_finish; + #else + int32_t count_threads; + pthread_mutex_t count_threads_mutex; + pthread_cond_t count_threads_cv; + #endif + #if !defined(_WIN32) + pthread_attr_t ct_attr; /* creation time attrs for threads */ + #endif + int32_t thread_giveup_code; /* error code when give up */ + int32_t thread_nblock; /* block counter */ +}; + +struct thread_context { + struct blosc_context* parent_context; + int32_t tid; + uint8_t* tmp; + uint8_t* tmp2; + uint8_t* tmp3; + int32_t tmpblocksize; /* Used to keep track of how big the temporary buffers are */ +}; + +/* Global context for non-contextual API */ +static struct blosc_context* g_global_context; +static pthread_mutex_t global_comp_mutex; +static int32_t g_compressor = BLOSC_BLOSCLZ; /* the compressor to use by default */ +static int32_t g_threads = 1; +static int32_t g_force_blocksize = 0; +static int32_t g_initlib = 0; -/* Structure for parameters meant for keeping track of current 
temporaries */ -static struct temp_data { - int32_t nthreads; - int32_t typesize; - int32_t blocksize; -} current_temp; +/* Wrapped function to adjust the number of threads used by blosc */ +int blosc_set_nthreads_(struct blosc_context*); + +/* Releases the global threadpool */ +int blosc_release_threadpool(struct blosc_context* context); /* Macros for synchronization */ /* Wait until all threads are initialized */ #ifdef _POSIX_BARRIERS_MINE -static int rc; -#define WAIT_INIT(RET_VAL) \ - rc = pthread_barrier_wait(&barr_init); \ +#define WAIT_INIT(RET_VAL, CONTEXT_PTR) \ + rc = pthread_barrier_wait(&CONTEXT_PTR->barr_init); \ if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { \ - printf("Could not wait on barrier (init)\n"); \ - return((RET_VAL)); \ + printf("Could not wait on barrier (init): %d\n", rc); \ + return((RET_VAL)); \ } #else -#define WAIT_INIT(RET_VAL) \ - pthread_mutex_lock(&count_threads_mutex); \ - if (count_threads < nthreads) { \ - count_threads++; \ - pthread_cond_wait(&count_threads_cv, &count_threads_mutex); \ +#define WAIT_INIT(RET_VAL, CONTEXT_PTR) \ + pthread_mutex_lock(&CONTEXT_PTR->count_threads_mutex); \ + if (CONTEXT_PTR->count_threads < CONTEXT_PTR->numthreads) { \ + CONTEXT_PTR->count_threads++; \ + pthread_cond_wait(&CONTEXT_PTR->count_threads_cv, &CONTEXT_PTR->count_threads_mutex); \ } \ else { \ - pthread_cond_broadcast(&count_threads_cv); \ + pthread_cond_broadcast(&CONTEXT_PTR->count_threads_cv); \ } \ - pthread_mutex_unlock(&count_threads_mutex); + pthread_mutex_unlock(&CONTEXT_PTR->count_threads_mutex); #endif /* Wait for all threads to finish */ #ifdef _POSIX_BARRIERS_MINE -#define WAIT_FINISH(RET_VAL) \ - rc = pthread_barrier_wait(&barr_finish); \ +#define WAIT_FINISH(RET_VAL, CONTEXT_PTR) \ + rc = pthread_barrier_wait(&CONTEXT_PTR->barr_finish); \ if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { \ printf("Could not wait on barrier (finish)\n"); \ - return((RET_VAL)); \ + return((RET_VAL)); \ } #else -#define 
WAIT_FINISH(RET_VAL) \ - pthread_mutex_lock(&count_threads_mutex); \ - if (count_threads > 0) { \ - count_threads--; \ - pthread_cond_wait(&count_threads_cv, &count_threads_mutex); \ +#define WAIT_FINISH(RET_VAL, CONTEXT_PTR) \ + pthread_mutex_lock(&CONTEXT_PTR->count_threads_mutex); \ + if (CONTEXT_PTR->count_threads > 0) { \ + CONTEXT_PTR->count_threads--; \ + pthread_cond_wait(&CONTEXT_PTR->count_threads_cv, &CONTEXT_PTR->count_threads_mutex); \ } \ else { \ - pthread_cond_broadcast(&count_threads_cv); \ + pthread_cond_broadcast(&CONTEXT_PTR->count_threads_cv); \ } \ - pthread_mutex_unlock(&count_threads_mutex); + pthread_mutex_unlock(&CONTEXT_PTR->count_threads_mutex); #endif @@ -186,15 +201,19 @@ static uint8_t *my_malloc(size_t size) void *block = NULL; int res = 0; -#if defined(_WIN32) +/* Do an alignment to 32 bytes because AVX2 is supported */ +#if _ISOC11_SOURCE + /* C11 aligned allocation. 'size' must be a multiple of the alignment. */ + block = aligned_alloc(32, size); +#elif defined(_WIN32) /* A (void *) cast needed for avoiding a warning with MINGW :-/ */ - block = (void *)_aligned_malloc(size, 16); + block = (void *)_aligned_malloc(size, 32); #elif defined __APPLE__ /* Mac OS X guarantees 16-byte alignment in small allocs */ block = malloc(size); #elif _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 /* Platform does have an implementation of posix_memalign */ - res = posix_memalign(&block, 16, size); + res = posix_memalign(&block, 32, size); #else block = malloc(size); #endif /* _WIN32 */ @@ -220,7 +239,7 @@ static void my_free(void *block) /* Copy 4 bytes from `*pa` to int32_t, changing endianness if necessary. 
*/ -static int32_t sw32_(uint8_t *pa) +static int32_t sw32_(const uint8_t *pa) { int32_t idest; uint8_t *dest = (uint8_t *)&idest; @@ -296,7 +315,7 @@ static char *clibcode_to_clibname(int clibcode) if (clibcode == BLOSC_LZ4_LIB) return BLOSC_LZ4_LIBNAME; if (clibcode == BLOSC_SNAPPY_LIB) return BLOSC_SNAPPY_LIBNAME; if (clibcode == BLOSC_ZLIB_LIB) return BLOSC_ZLIB_LIBNAME; - return NULL; /* should never happen */ + return NULL; /* should never happen */ } @@ -378,11 +397,11 @@ return code; #if defined(HAVE_LZ4) static int lz4_wrap_compress(const char* input, size_t input_length, - char* output, size_t maxout) + char* output, size_t maxout, int accel) { int cbytes; - cbytes = LZ4_compress_limitedOutput(input, output, (int)input_length, - (int)maxout); + cbytes = LZ4_compress_fast(input, output, (int)input_length, (int)maxout, + accel); return cbytes; } @@ -394,7 +413,7 @@ static int lz4hc_wrap_compress(const char* input, size_t input_length, return -1; /* input larger than 1 GB is not supported */ /* clevel for lz4hc goes up to 16, at least in LZ4 1.1.3 */ cbytes = LZ4_compressHC2_limitedOutput(input, output, (int)input_length, - (int)maxout, clevel*2-1); + (int)maxout, clevel*2-1); return cbytes; } @@ -446,7 +465,7 @@ static int zlib_wrap_compress(const char* input, size_t input_length, int status; uLongf cl = maxout; status = compress2( - (Bytef*)output, &cl, (Bytef*)input, (uLong)input_length, clevel); + (Bytef*)output, &cl, (Bytef*)input, (uLong)input_length, clevel); if (status != Z_OK){ return 0; } @@ -468,28 +487,64 @@ static int zlib_wrap_decompress(const char* input, size_t compressed_length, #endif /* HAVE_ZLIB */ +/* Compute acceleration for blosclz */ +static int get_accel(const struct blosc_context* context) { + int32_t clevel = context->clevel; + int32_t typesize = context->typesize; + + if (clevel == 9) { + return 1; + } + if (context->compcode == BLOSC_BLOSCLZ) { + /* Compute the power of 2. 
See: + * http://www.exploringbinary.com/ten-ways-to-check-if-an-integer-is-a-power-of-two-in-c/ + */ + int32_t tspow2 = ((typesize != 0) && !(typesize & (typesize - 1))); + if (tspow2 && typesize < 32) { + return 32; + } + } + else if (context->compcode == BLOSC_LZ4) { + /* This acceleration setting based on discussions held in: + * https://groups.google.com/forum/#!topic/lz4c/zosy90P8MQw + */ + return (10 - clevel); + } + return 1; +} + /* Shuffle & compress a single block */ -static int blosc_c(int32_t blocksize, int32_t leftoverblock, - int32_t ntbytes, int32_t maxbytes, - uint8_t *src, uint8_t *dest, uint8_t *tmp) +static int blosc_c(const struct blosc_context* context, int32_t blocksize, + int32_t leftoverblock, int32_t ntbytes, int32_t maxbytes, + const uint8_t *src, uint8_t *dest, uint8_t *tmp, + uint8_t *tmp2) { int32_t j, neblock, nsplits; int32_t cbytes; /* number of compressed bytes in split */ int32_t ctbytes = 0; /* number of compressed bytes in block */ int32_t maxout; - int32_t typesize = params.typesize; - uint8_t *_tmp; + int32_t typesize = context->typesize; + const uint8_t *_tmp = src; char *compname; + int accel; + int bscount; - if ((params.flags & BLOSC_DOSHUFFLE) && (typesize > 1)) { - /* Shuffle this block (this makes sense only if typesize > 1) */ + if (*(context->header_flags) & BLOSC_DOSHUFFLE) { + /* Byte shuffling only makes sense if typesize > 1 */ shuffle(typesize, blocksize, src, tmp); _tmp = tmp; } - else { - _tmp = src; + /* We don't allow more than 1 filter at the same time (yet) */ + else if (*(context->header_flags) & BLOSC_DOBITSHUFFLE) { + bscount = bitshuffle(typesize, blocksize, src, tmp, tmp2); + if (bscount < 0) + return bscount; + _tmp = tmp; } + /* Calculate acceleration for different compressors */ + accel = get_accel(context); + /* Compress for each shuffled slice split for this block. */ /* If typesize is too large, neblock is too small or we are in a leftover block, do not split at all. 
*/ @@ -507,7 +562,7 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock, ctbytes += (int32_t)sizeof(int32_t); maxout = neblock; #if defined(HAVE_SNAPPY) - if (compressor == BLOSC_SNAPPY) { + if (context->compcode == BLOSC_SNAPPY) { /* TODO perhaps refactor this to keep the value stashed somewhere */ maxout = snappy_max_compressed_length(neblock); } @@ -518,35 +573,35 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock, return 0; /* non-compressible block */ } } - if (compressor == BLOSC_BLOSCLZ) { - cbytes = blosclz_compress(params.clevel, _tmp+j*neblock, neblock, - dest, maxout); + if (context->compcode == BLOSC_BLOSCLZ) { + cbytes = blosclz_compress(context->clevel, _tmp+j*neblock, neblock, + dest, maxout, accel); } #if defined(HAVE_LZ4) - else if (compressor == BLOSC_LZ4) { + else if (context->compcode == BLOSC_LZ4) { cbytes = lz4_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock, - (char *)dest, (size_t)maxout); + (char *)dest, (size_t)maxout, accel); } - else if (compressor == BLOSC_LZ4HC) { + else if (context->compcode == BLOSC_LZ4HC) { cbytes = lz4hc_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock, - (char *)dest, (size_t)maxout, params.clevel); + (char *)dest, (size_t)maxout, context->clevel); } #endif /* HAVE_LZ4 */ #if defined(HAVE_SNAPPY) - else if (compressor == BLOSC_SNAPPY) { + else if (context->compcode == BLOSC_SNAPPY) { cbytes = snappy_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock, (char *)dest, (size_t)maxout); } #endif /* HAVE_SNAPPY */ #if defined(HAVE_ZLIB) - else if (compressor == BLOSC_ZLIB) { + else if (context->compcode == BLOSC_ZLIB) { cbytes = zlib_wrap_compress((char *)_tmp+j*neblock, (size_t)neblock, - (char *)dest, (size_t)maxout, params.clevel); + (char *)dest, (size_t)maxout, context->clevel); } #endif /* HAVE_ZLIB */ else { - blosc_compcode_to_compname(compressor, &compname); + blosc_compcode_to_compname(context->compcode, &compname); fprintf(stderr, "Blosc has not been compiled with '%s' ", 
compname); fprintf(stderr, "compression support. Please use one having it."); return -5; /* signals no compression support */ @@ -560,7 +615,7 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock, /* cbytes should never be negative */ return -2; } - else if (cbytes == 0) { + else if (cbytes == 0 || cbytes == neblock) { /* The compressor has been unable to compress data at all. */ /* Before doing the copy, check that we are not running into a buffer overflow. */ @@ -580,27 +635,26 @@ static int blosc_c(int32_t blocksize, int32_t leftoverblock, } /* Decompress & unshuffle a single block */ -static int blosc_d(int32_t blocksize, int32_t leftoverblock, - uint8_t *src, uint8_t *dest, uint8_t *tmp, uint8_t *tmp2) +static int blosc_d(struct blosc_context* context, int32_t blocksize, int32_t leftoverblock, + const uint8_t *src, uint8_t *dest, uint8_t *tmp, uint8_t *tmp2) { int32_t j, neblock, nsplits; int32_t nbytes; /* number of decompressed bytes in split */ int32_t cbytes; /* number of compressed bytes in split */ int32_t ctbytes = 0; /* number of compressed bytes in block */ int32_t ntbytes = 0; /* number of uncompressed bytes in block */ - uint8_t *_tmp; - int32_t typesize = params.typesize; - int compressor_format; + uint8_t *_tmp = dest; + int32_t typesize = context->typesize; + int32_t compformat; char *compname; + int bscount; - if ((params.flags & BLOSC_DOSHUFFLE) && (typesize > 1)) { + if ((*(context->header_flags) & BLOSC_DOSHUFFLE) || \ + (*(context->header_flags) & BLOSC_DOBITSHUFFLE)) { _tmp = tmp; } - else { - _tmp = dest; - } - compressor_format = (params.flags & 0xe0) >> 5; + compformat = (*(context->header_flags) & 0xe0) >> 5; /* Compress for each shuffled slice split for this block. 
*/ if ((typesize <= MAX_SPLITS) && (blocksize/typesize) >= MIN_BUFFERSIZE && @@ -621,29 +675,29 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock, nbytes = neblock; } else { - if (compressor_format == BLOSC_BLOSCLZ_FORMAT) { + if (compformat == BLOSC_BLOSCLZ_FORMAT) { nbytes = blosclz_decompress(src, cbytes, _tmp, neblock); } #if defined(HAVE_LZ4) - else if (compressor_format == BLOSC_LZ4_FORMAT) { + else if (compformat == BLOSC_LZ4_FORMAT) { nbytes = lz4_wrap_decompress((char *)src, (size_t)cbytes, (char*)_tmp, (size_t)neblock); } #endif /* HAVE_LZ4 */ #if defined(HAVE_SNAPPY) - else if (compressor_format == BLOSC_SNAPPY_FORMAT) { + else if (compformat == BLOSC_SNAPPY_FORMAT) { nbytes = snappy_wrap_decompress((char *)src, (size_t)cbytes, (char*)_tmp, (size_t)neblock); } #endif /* HAVE_SNAPPY */ #if defined(HAVE_ZLIB) - else if (compressor_format == BLOSC_ZLIB_FORMAT) { + else if (compformat == BLOSC_ZLIB_FORMAT) { nbytes = zlib_wrap_decompress((char *)src, (size_t)cbytes, (char*)_tmp, (size_t)neblock); } #endif /* HAVE_ZLIB */ else { - blosc_compcode_to_compname(compressor_format, &compname); + compname = clibcode_to_clibname(compformat); fprintf(stderr, "Blosc has not been compiled with decompression " "support for '%s' format. ", compname); @@ -653,7 +707,7 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock, /* Check that decompressed bytes number is correct */ if (nbytes != neblock) { - return -2; + return -2; } } @@ -663,19 +717,13 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock, ntbytes += nbytes; } /* Closes j < nsplits */ - if ((params.flags & BLOSC_DOSHUFFLE) && (typesize > 1)) { - if ((uintptr_t)dest % 16 == 0) { - /* 16-bytes aligned dest. SSE2 unshuffle will work. */ - unshuffle(typesize, blocksize, tmp, dest); - } - else { - /* dest is not aligned. Use tmp2, which is aligned, and copy. */ - unshuffle(typesize, blocksize, tmp, tmp2); - if (tmp2 != dest) { - /* Copy only when dest is not tmp2 (e.g. 
not blosc_getitem()) */ - memcpy(dest, tmp2, blocksize); - } - } + if (*(context->header_flags) & BLOSC_DOSHUFFLE) { + unshuffle(typesize, blocksize, tmp, dest); + } + else if (*(context->header_flags) & BLOSC_DOBITSHUFFLE) { + bscount = bitunshuffle(typesize, blocksize, tmp, dest, tmp2); + if (bscount < 0) + return bscount; } /* Return the number of uncompressed bytes */ @@ -684,43 +732,40 @@ static int blosc_d(int32_t blocksize, int32_t leftoverblock, /* Serial version for compression/decompression */ -static int serial_blosc(void) +static int serial_blosc(struct blosc_context* context) { int32_t j, bsize, leftoverblock; int32_t cbytes; - int32_t compress = params.compress; - int32_t blocksize = params.blocksize; - int32_t ntbytes = params.ntbytes; - int32_t flags = params.flags; - int32_t maxbytes = params.maxbytes; - int32_t nblocks = params.nblocks; - int32_t leftover = params.nbytes % params.blocksize; - uint8_t *bstarts = params.bstarts; - uint8_t *src = params.src; - uint8_t *dest = params.dest; - uint8_t *tmp = params.tmp[0]; /* tmp for thread 0 */ - uint8_t *tmp2 = params.tmp2[0]; /* tmp2 for thread 0 */ - for (j = 0; j < nblocks; j++) { - if (compress && !(flags & BLOSC_MEMCPYED)) { - _sw32(bstarts + j * 4, ntbytes); + int32_t ebsize = context->blocksize + context->typesize * (int32_t)sizeof(int32_t); + int32_t ntbytes = context->num_output_bytes; + + uint8_t *tmp = my_malloc(context->blocksize + ebsize); + uint8_t *tmp2 = tmp + context->blocksize; + + for (j = 0; j < context->nblocks; j++) { + if (context->compress && !(*(context->header_flags) & BLOSC_MEMCPYED)) { + _sw32(context->bstarts + j * 4, ntbytes); } - bsize = blocksize; + bsize = context->blocksize; leftoverblock = 0; - if ((j == nblocks - 1) && (leftover > 0)) { - bsize = leftover; + if ((j == context->nblocks - 1) && (context->leftover > 0)) { + bsize = context->leftover; leftoverblock = 1; } - if (compress) { - if (flags & BLOSC_MEMCPYED) { + if (context->compress) { + if 
(*(context->header_flags) & BLOSC_MEMCPYED) { /* We want to memcpy only */ - memcpy(dest+BLOSC_MAX_OVERHEAD+j*blocksize, src+j*blocksize, bsize); + memcpy(context->dest+BLOSC_MAX_OVERHEAD+j*context->blocksize, + context->src+j*context->blocksize, + bsize); cbytes = bsize; } else { /* Regular compression */ - cbytes = blosc_c(bsize, leftoverblock, ntbytes, maxbytes, - src+j*blocksize, dest+ntbytes, tmp); + cbytes = blosc_c(context, bsize, leftoverblock, ntbytes, + context->destsize, context->src+j*context->blocksize, + context->dest+ntbytes, tmp, tmp2); if (cbytes == 0) { ntbytes = 0; /* uncompressible data */ break; @@ -728,16 +773,18 @@ static int serial_blosc(void) } } else { - if (flags & BLOSC_MEMCPYED) { + if (*(context->header_flags) & BLOSC_MEMCPYED) { /* We want to memcpy only */ - memcpy(dest+j*blocksize, src+BLOSC_MAX_OVERHEAD+j*blocksize, bsize); + memcpy(context->dest+j*context->blocksize, + context->src+BLOSC_MAX_OVERHEAD+j*context->blocksize, + bsize); cbytes = bsize; } else { /* Regular decompression */ - cbytes = blosc_d(bsize, leftoverblock, - src + sw32_(bstarts + j * 4), - dest+j*blocksize, tmp, tmp2); + cbytes = blosc_d(context, bsize, leftoverblock, + context->src + sw32_(context->bstarts + j * 4), + context->dest+j*context->blocksize, tmp, tmp2); } } if (cbytes < 0) { @@ -747,125 +794,62 @@ static int serial_blosc(void) ntbytes += cbytes; } + // Free temporaries + my_free(tmp); + return ntbytes; } /* Threaded version for compression/decompression */ -static int parallel_blosc(void) +static int parallel_blosc(struct blosc_context* context) { + int rc; /* Check whether we need to restart threads */ - if (!init_threads_done || pid != getpid()) { - blosc_set_nthreads_(nthreads); - } + blosc_set_nthreads_(context); + + /* Set sentinels */ + context->thread_giveup_code = 1; + context->thread_nblock = -1; /* Synchronization point for all threads (wait for initialization) */ - WAIT_INIT(-1); + WAIT_INIT(-1, context); + /* Synchronization point for all 
threads (wait for finalization) */ - WAIT_FINISH(-1); + WAIT_FINISH(-1, context); - if (giveup_code > 0) { + if (context->thread_giveup_code > 0) { /* Return the total bytes (de-)compressed in threads */ - return params.ntbytes; + return context->num_output_bytes; } else { /* Compression/decompression gave up. Return error code. */ - return giveup_code; + return context->thread_giveup_code; } } -/* Convenience functions for creating and releasing temporaries */ -static int create_temporaries(void) -{ - int32_t tid, ebsize; - int32_t typesize = params.typesize; - int32_t blocksize = params.blocksize; - - /* Extended blocksize for temporary destination. Extended blocksize - is only useful for compression in parallel mode, but it doesn't - hurt serial mode either. */ - ebsize = blocksize + typesize * (int32_t)sizeof(int32_t); - - /* Create temporary area for each thread */ - for (tid = 0; tid < nthreads; tid++) { - uint8_t *tmp = my_malloc(blocksize); - uint8_t *tmp2; - if (tmp == NULL) { - return -1; - } - params.tmp[tid] = tmp; - tmp2 = my_malloc(ebsize); - if (tmp2 == NULL) { - return -1; - } - params.tmp2[tid] = tmp2; - } - - init_temps_done = 1; - /* Update params for current temporaries */ - current_temp.nthreads = nthreads; - current_temp.typesize = typesize; - current_temp.blocksize = blocksize; - return 0; -} - - -static void release_temporaries(void) -{ - int32_t tid; - - /* Release buffers */ - for (tid = 0; tid < nthreads; tid++) { - my_free(params.tmp[tid]); - my_free(params.tmp2[tid]); - } - - init_temps_done = 0; -} - - /* Do the compression or decompression of the buffer depending on the global params. 
*/ -static int do_job(void) +static int do_job(struct blosc_context* context) { int32_t ntbytes; - /* Initialize/reset temporaries if needed */ - if (!init_temps_done) { - int ret; - ret = create_temporaries(); - if (ret < 0) { - return -1; - } - } - else if (current_temp.nthreads != nthreads || - current_temp.typesize != params.typesize || - current_temp.blocksize != params.blocksize) { - int ret; - release_temporaries(); - ret = create_temporaries(); - if (ret < 0) { - return -1; - } - } - /* Run the serial version when nthreads is 1 or when the buffers are not much larger than blocksize */ - if (nthreads == 1 || (params.nbytes / params.blocksize) <= 1) { - ntbytes = serial_blosc(); + if (context->numthreads == 1 || (context->sourcesize / context->blocksize) <= 1) { + ntbytes = serial_blosc(context); } else { - ntbytes = parallel_blosc(); + ntbytes = parallel_blosc(context); } return ntbytes; } -static int32_t compute_blocksize(int32_t clevel, int32_t typesize, - int32_t nbytes) +static int32_t compute_blocksize(struct blosc_context* context, int32_t clevel, int32_t typesize, int32_t nbytes, int32_t forced_blocksize) { int32_t blocksize; @@ -876,71 +860,56 @@ static int32_t compute_blocksize(int32_t clevel, int32_t typesize, blocksize = nbytes; /* Start by a whole buffer as blocksize */ - if (force_blocksize) { - blocksize = force_blocksize; - /* Check that forced blocksize is not too small nor too large */ + if (forced_blocksize) { + blocksize = forced_blocksize; + /* Check that forced blocksize is not too small */ if (blocksize < MIN_BUFFERSIZE) { blocksize = MIN_BUFFERSIZE; } } - else if (nbytes >= L1 * typesize) { - blocksize = L1 * typesize; + else if (nbytes >= L1) { + blocksize = L1; /* For Zlib, increase the block sizes in a factor of 8 because it is meant for compression large blocks (it shows a big overhead in compressing small ones). 
*/ - if (compressor == BLOSC_ZLIB) { + if (context->compcode == BLOSC_ZLIB) { blocksize *= 8; } /* For LZ4HC, increase the block sizes in a factor of 8 because it is meant for compression large blocks (it shows a big overhead in compressing small ones). */ - if (compressor == BLOSC_LZ4HC) { + if (context->compcode == BLOSC_LZ4HC) { blocksize *= 8; } if (clevel == 0) { - blocksize /= 16; + blocksize /= 4; } else if (clevel <= 3) { - blocksize /= 8; + blocksize /= 2; } else if (clevel <= 5) { - blocksize /= 4; + blocksize *= 1; } else if (clevel <= 6) { - blocksize /= 2; + blocksize *= 2; } else if (clevel < 9) { - blocksize *= 1; + blocksize *= 4; } else { - blocksize *= 2; + blocksize *= 16; } } - else if (nbytes > (16 * 16)) { - /* align to typesize to make use of vectorized shuffles */ - if (typesize == 2) { - blocksize -= blocksize % (16 * 2); - } - else if (typesize == 4) { - blocksize -= blocksize % (16 * 4); - } - else if (typesize == 8) { - blocksize -= blocksize % (16 * 8); - } - else if (typesize == 16) { - blocksize -= blocksize % (16 * 16); - } - } /* Check that blocksize is not too large */ if (blocksize > (int32_t)nbytes) { blocksize = nbytes; } - /* blocksize must be a multiple of the typesize */ + /* blocksize *must absolutely* be a multiple of the typesize */ if (blocksize > typesize) { blocksize = blocksize / typesize * typesize; } @@ -948,37 +917,39 @@ static int32_t compute_blocksize(int32_t clevel, int32_t typesize, return blocksize; } -#define BLOSC_UNLOCK_RETURN(val) \ - return (pthread_mutex_unlock(&global_comp_mutex), val) - -/* The public routine for compression. See blosc.h for docstrings. 
*/ -int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes, - const void *src, void *dest, size_t destsize) +static int initialize_context_compression(struct blosc_context* context, + int clevel, + int doshuffle, + size_t typesize, + size_t sourcesize, + const void* src, + void* dest, + size_t destsize, + int32_t compressor, + int32_t blocksize, + int32_t numthreads) { - uint8_t *_dest=NULL; /* current pos for destination buffer */ - uint8_t *flags; /* flags for header. Currently booked: - - 0: shuffled? - - 1: memcpy'ed? */ - int32_t nbytes_; /* number of bytes in source buffer */ - int32_t nblocks; /* number of total blocks in buffer */ - int32_t leftover; /* extra bytes at end of buffer */ - int32_t blocksize; /* length of the block in bytes */ - int32_t ntbytes = 0; /* the number of compressed bytes */ - int32_t maxbytes = (int32_t)destsize; /* maximum size for dest buffer */ - int compressor_format = -1; /* the format for compressor */ - uint8_t *bstarts; /* start pointers for each block */ + /* Set parameters */ + context->compress = 1; + context->src = (const uint8_t*)src; + context->dest = (uint8_t *)(dest); + context->num_output_bytes = 0; + context->destsize = (int32_t)destsize; + context->sourcesize = sourcesize; + context->typesize = typesize; + context->compcode = compressor; + context->numthreads = numthreads; + context->end_threads = 0; + context->clevel = clevel; /* Check buffer size limits */ - if (nbytes > BLOSC_MAX_BUFFERSIZE) { + if (sourcesize > BLOSC_MAX_BUFFERSIZE) { /* If buffer is too large, give up. 
*/ fprintf(stderr, "Input buffer size cannot exceed %d bytes\n", BLOSC_MAX_BUFFERSIZE); return -1; } - /* We can safely do this assignation now */ - nbytes_ = (int32_t)nbytes; - /* Compression level */ if (clevel < 0 || clevel > 9) { /* If clevel not in 0..9, print an error */ @@ -987,227 +958,316 @@ int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes, } /* Shuffle */ - if (doshuffle != 0 && doshuffle != 1) { - fprintf(stderr, "`shuffle` parameter must be either 0 or 1!\n"); + if (doshuffle != 0 && doshuffle != 1 && doshuffle != 2) { + fprintf(stderr, "`shuffle` parameter must be either 0, 1 or 2!\n"); return -10; } /* Check typesize limits */ - if (typesize > BLOSC_MAX_TYPESIZE) { + if (context->typesize > BLOSC_MAX_TYPESIZE) { /* If typesize is too large, treat buffer as an 1-byte stream. */ - typesize = 1; + context->typesize = 1; } /* Get the blocksize */ - blocksize = compute_blocksize(clevel, (int32_t)typesize, nbytes_); + context->blocksize = compute_blocksize(context, clevel, (int32_t)context->typesize, context->sourcesize, blocksize); /* Compute number of blocks in buffer */ - nblocks = nbytes_ / blocksize; - leftover = nbytes_ % blocksize; - nblocks = (leftover>0)? nblocks+1: nblocks; + context->nblocks = context->sourcesize / context->blocksize; + context->leftover = context->sourcesize % context->blocksize; + context->nblocks = (context->leftover > 0) ? 
(context->nblocks + 1) : context->nblocks; + + return 1; +} + +static int write_compression_header(struct blosc_context* context, int clevel, int doshuffle) +{ + int32_t compformat; + + /* Write version header for this block */ + context->dest[0] = BLOSC_VERSION_FORMAT; /* blosc format version */ + + /* Write compressor format */ + compformat = -1; + switch (context->compcode) + { + case BLOSC_BLOSCLZ: + compformat = BLOSC_BLOSCLZ_FORMAT; + context->dest[1] = BLOSC_BLOSCLZ_VERSION_FORMAT; /* blosclz format version */ + break; + +#if defined(HAVE_LZ4) + case BLOSC_LZ4: + compformat = BLOSC_LZ4_FORMAT; + context->dest[1] = BLOSC_LZ4_VERSION_FORMAT; /* lz4 format version */ + break; + case BLOSC_LZ4HC: + compformat = BLOSC_LZ4HC_FORMAT; + context->dest[1] = BLOSC_LZ4HC_VERSION_FORMAT; /* lz4hc is the same as lz4 */ + break; +#endif /* HAVE_LZ4 */ + +#if defined(HAVE_SNAPPY) + case BLOSC_SNAPPY: + compformat = BLOSC_SNAPPY_FORMAT; + context->dest[1] = BLOSC_SNAPPY_VERSION_FORMAT; /* snappy format version */ + break; +#endif /* HAVE_SNAPPY */ + +#if defined(HAVE_ZLIB) + case BLOSC_ZLIB: + compformat = BLOSC_ZLIB_FORMAT; + context->dest[1] = BLOSC_ZLIB_VERSION_FORMAT; /* zlib format version */ + break; +#endif /* HAVE_ZLIB */ + + default: + { + char *compname; + compname = clibcode_to_clibname(compformat); + fprintf(stderr, "Blosc has not been compiled with '%s' ", compname); + fprintf(stderr, "compression support. 
Please use one having it."); + return -5; /* signals no compression support */ + break; + } + } + + context->header_flags = context->dest+2; /* flags */ + context->dest[2] = 0; /* zeroes flags */ + context->dest[3] = (uint8_t)context->typesize; /* type size */ + _sw32(context->dest + 4, context->sourcesize); /* size of the buffer */ + _sw32(context->dest + 8, context->blocksize); /* block size */ + context->bstarts = context->dest + 16; /* starts for every block */ + context->num_output_bytes = 16 + sizeof(int32_t)*context->nblocks; /* space for header and pointers */ - _dest = (uint8_t *)(dest); - /* Write header for this block */ - _dest[0] = BLOSC_VERSION_FORMAT; /* blosc format version */ - if (compressor == BLOSC_BLOSCLZ) { - compressor_format = BLOSC_BLOSCLZ_FORMAT; - _dest[1] = BLOSC_BLOSCLZ_VERSION_FORMAT; /* blosclz format version */ - } - #if defined(HAVE_LZ4) - else if (compressor == BLOSC_LZ4) { - compressor_format = BLOSC_LZ4_FORMAT; - _dest[1] = BLOSC_LZ4_VERSION_FORMAT; /* lz4 format version */ - } - else if (compressor == BLOSC_LZ4HC) { - compressor_format = BLOSC_LZ4_FORMAT; - _dest[1] = BLOSC_LZ4_VERSION_FORMAT; /* lz4hc is the same than lz4 */ - } - #endif /* HAVE_LZ4 */ - #if defined(HAVE_SNAPPY) - else if (compressor == BLOSC_SNAPPY) { - compressor_format = BLOSC_SNAPPY_FORMAT; - _dest[1] = BLOSC_SNAPPY_VERSION_FORMAT; /* snappy format version */ - } - #endif /* HAVE_SNAPPY */ - #if defined(HAVE_ZLIB) - else if (compressor == BLOSC_ZLIB) { - compressor_format = BLOSC_ZLIB_FORMAT; - _dest[1] = BLOSC_ZLIB_VERSION_FORMAT; /* zlib format version */ - } - #endif /* HAVE_ZLIB */ - - flags = _dest+2; /* flags */ - _dest[2] = 0; /* zeroes flags */ - _dest[3] = (uint8_t)typesize; /* type size */ - _sw32(_dest + 4, nbytes_); /* size of the buffer */ - _sw32(_dest + 8, blocksize); /* block size */ - bstarts = _dest + 16; /* starts for every block */ - ntbytes = 16 + sizeof(int32_t)*nblocks; /* space for header and pointers */ - - if (clevel == 0) { + if 
(context->clevel == 0) { /* Compression level 0 means buffer to be memcpy'ed */ - *flags |= BLOSC_MEMCPYED; + *(context->header_flags) |= BLOSC_MEMCPYED; } - if (nbytes_ < MIN_BUFFERSIZE) { + if (context->sourcesize < MIN_BUFFERSIZE) { /* Buffer is too small. Try memcpy'ing. */ - *flags |= BLOSC_MEMCPYED; + *(context->header_flags) |= BLOSC_MEMCPYED; } - if (doshuffle == 1) { - /* Shuffle is active */ - *flags |= BLOSC_DOSHUFFLE; /* bit 0 set to one in flags */ + if (doshuffle == BLOSC_SHUFFLE) { + /* Byte-shuffle is active */ + *(context->header_flags) |= BLOSC_DOSHUFFLE; /* bit 0 set to one in flags */ } - *flags |= compressor_format << 5; /* compressor format start at bit 5 */ + if (doshuffle == BLOSC_BITSHUFFLE) { + /* Bit-shuffle is active */ + *(context->header_flags) |= BLOSC_DOBITSHUFFLE; /* bit 2 set to one in flags */ + } - /* Take global lock for the time of compression */ - pthread_mutex_lock(&global_comp_mutex); - /* Populate parameters for compression routines */ - params.compress = 1; - params.clevel = clevel; - params.flags = (int32_t)*flags; - params.typesize = (int32_t)typesize; - params.blocksize = blocksize; - params.ntbytes = ntbytes; - params.nbytes = nbytes_; - params.maxbytes = maxbytes; - params.nblocks = nblocks; - params.leftover = leftover; - params.bstarts = bstarts; - params.src = (uint8_t *)src; - params.dest = (uint8_t *)dest; - - if (!(*flags & BLOSC_MEMCPYED)) { + *(context->header_flags) |= compformat << 5; /* compressor format start at bit 5 */ + + return 1; +} + +int blosc_compress_context(struct blosc_context* context) +{ + int32_t ntbytes = 0; + + if (!(*(context->header_flags) & BLOSC_MEMCPYED)) { /* Do the actual compression */ - ntbytes = do_job(); + ntbytes = do_job(context); if (ntbytes < 0) { - BLOSC_UNLOCK_RETURN(-1); + return -1; } - if ((ntbytes == 0) && (nbytes_+BLOSC_MAX_OVERHEAD <= maxbytes)) { + if ((ntbytes == 0) && (context->sourcesize+BLOSC_MAX_OVERHEAD <= context->destsize)) { /* Last chance for fitting `src` 
buffer in `dest`. Update flags and do a memcpy later on. */ - *flags |= BLOSC_MEMCPYED; - params.flags |= BLOSC_MEMCPYED; + *(context->header_flags) |= BLOSC_MEMCPYED; } } - if (*flags & BLOSC_MEMCPYED) { - if (nbytes_+BLOSC_MAX_OVERHEAD > maxbytes) { + if (*(context->header_flags) & BLOSC_MEMCPYED) { + if (context->sourcesize + BLOSC_MAX_OVERHEAD > context->destsize) { /* We are exceeding maximum output size */ ntbytes = 0; } - else if (((nbytes_ % L1) == 0) || (nthreads > 1)) { + else if (((context->sourcesize % L1) == 0) || (context->numthreads > 1)) { /* More effective with large buffers that are multiples of the cache size or multi-cores */ - params.ntbytes = BLOSC_MAX_OVERHEAD; - ntbytes = do_job(); + context->num_output_bytes = BLOSC_MAX_OVERHEAD; + ntbytes = do_job(context); if (ntbytes < 0) { - BLOSC_UNLOCK_RETURN(-1); + return -1; } } else { - memcpy((uint8_t *)dest+BLOSC_MAX_OVERHEAD, src, nbytes_); - ntbytes = nbytes_ + BLOSC_MAX_OVERHEAD; + memcpy(context->dest+BLOSC_MAX_OVERHEAD, context->src, context->sourcesize); + ntbytes = context->sourcesize + BLOSC_MAX_OVERHEAD; } } /* Set the number of compressed bytes in header */ - _sw32(_dest + 12, ntbytes); + _sw32(context->dest + 12, ntbytes); - /* Release global lock */ - pthread_mutex_unlock(&global_comp_mutex); - - assert(ntbytes <= maxbytes); + assert(ntbytes <= context->destsize); return ntbytes; } +/* The public routine for compression with context. */ +int blosc_compress_ctx(int clevel, int doshuffle, size_t typesize, + size_t nbytes, const void* src, void* dest, + size_t destsize, const char* compressor, + size_t blocksize, int numinternalthreads) +{ + int error, result; + struct blosc_context context; -/* The public routine for decompression. See blosc.h for docstrings. 
*/ -int blosc_decompress(const void *src, void *dest, size_t destsize) + context.threads_started = 0; + error = initialize_context_compression(&context, clevel, doshuffle, typesize, + nbytes, src, dest, destsize, + blosc_compname_to_compcode(compressor), + blocksize, numinternalthreads); + if (error < 0) { return error; } + + error = write_compression_header(&context, clevel, doshuffle); + if (error < 0) { return error; } + + result = blosc_compress_context(&context); + + if (numinternalthreads > 1) + { + blosc_release_threadpool(&context); + } + + return result; +} + +/* The public routine for compression. See blosc.h for docstrings. */ +int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes, + const void *src, void *dest, size_t destsize) { - uint8_t *_src=NULL; /* current pos for source buffer */ - uint8_t version, versionlz; /* versions for compressed header */ - uint8_t flags; /* flags for header */ - int32_t ntbytes; /* the number of uncompressed bytes */ - int32_t nblocks; /* number of total blocks in buffer */ - int32_t leftover; /* extra bytes at end of buffer */ - int32_t typesize, blocksize, nbytes, ctbytes; - uint8_t *bstarts; /* start pointers for each block */ + int error; + int result; - _src = (uint8_t *)(src); + pthread_mutex_lock(&global_comp_mutex); + + error = initialize_context_compression(g_global_context, clevel, doshuffle, typesize, nbytes, + src, dest, destsize, g_compressor, g_force_blocksize, g_threads); + if (error < 0) { return error; } + + error = write_compression_header(g_global_context, clevel, doshuffle); + if (error < 0) { return error; } + + result = blosc_compress_context(g_global_context); + + pthread_mutex_unlock(&global_comp_mutex); + + return result; +} + +int blosc_run_decompression_with_context(struct blosc_context* context, + const void* src, + void* dest, + size_t destsize, + int numinternalthreads) +{ + uint8_t version; + uint8_t versionlz; + uint32_t ctbytes; + int32_t ntbytes; + + 
context->compress = 0; + context->src = (const uint8_t*)src; + context->dest = (uint8_t*)dest; + context->destsize = destsize; + context->num_output_bytes = 0; + context->numthreads = numinternalthreads; + context->end_threads = 0; /* Read the header block */ - version = _src[0]; /* blosc format version */ - versionlz = _src[1]; /* blosclz format version */ - flags = _src[2]; /* flags */ - typesize = (int32_t)_src[3]; /* typesize */ - nbytes = sw32_(_src + 4); /* buffer size */ - blocksize = sw32_(_src + 8); /* block size */ - ctbytes = sw32_(_src + 12); /* compressed buffer size */ + version = context->src[0]; /* blosc format version */ + versionlz = context->src[1]; /* blosclz format version */ + context->header_flags = (uint8_t*)(context->src + 2); /* flags */ + context->typesize = (int32_t)context->src[3]; /* typesize */ + context->sourcesize = sw32_(context->src + 4); /* buffer size */ + context->blocksize = sw32_(context->src + 8); /* block size */ + ctbytes = sw32_(context->src + 12); /* compressed buffer size */ + + /* Unused values */ version += 0; /* shut up compiler warning */ versionlz += 0; /* shut up compiler warning */ ctbytes += 0; /* shut up compiler warning */ - bstarts = _src + 16; + context->bstarts = (uint8_t*)(context->src + 16); /* Compute some params */ /* Total blocks */ - nblocks = nbytes / blocksize; - leftover = nbytes % blocksize; - nblocks = (leftover>0)? nblocks+1: nblocks; + context->nblocks = context->sourcesize / context->blocksize; + context->leftover = context->sourcesize % context->blocksize; + context->nblocks = (context->leftover>0)? 
context->nblocks+1: context->nblocks; /* Check that we have enough space to decompress */ - if (nbytes > (int32_t)destsize) { + if (context->sourcesize > (int32_t)destsize) { return -1; } - /* Take global lock for the time of decompression */ - pthread_mutex_lock(&global_comp_mutex); - - /* Populate parameters for decompression routines */ - params.compress = 0; - params.clevel = 0; /* specific for compression */ - params.flags = (int32_t)flags; - params.typesize = typesize; - params.blocksize = blocksize; - params.ntbytes = 0; - params.nbytes = nbytes; - params.nblocks = nblocks; - params.leftover = leftover; - params.bstarts = bstarts; - params.src = (uint8_t *)src; - params.dest = (uint8_t *)dest; - /* Check whether this buffer is memcpy'ed */ - if (flags & BLOSC_MEMCPYED) { - if (((nbytes % L1) == 0) || (nthreads > 1)) { + if (*(context->header_flags) & BLOSC_MEMCPYED) { + if (((context->sourcesize % L1) == 0) || (context->numthreads > 1)) { /* More effective with large buffers that are multiples of the cache size or multi-cores */ - ntbytes = do_job(); + ntbytes = do_job(context); if (ntbytes < 0) { - BLOSC_UNLOCK_RETURN(-1); + return -1; } } else { - memcpy(dest, (uint8_t *)src+BLOSC_MAX_OVERHEAD, nbytes); - ntbytes = nbytes; + memcpy(dest, (uint8_t *)src+BLOSC_MAX_OVERHEAD, context->sourcesize); + ntbytes = context->sourcesize; } } else { /* Do the actual decompression */ - ntbytes = do_job(); + ntbytes = do_job(context); if (ntbytes < 0) { - BLOSC_UNLOCK_RETURN(-1); + return -1; } } - /* Release global lock */ - pthread_mutex_unlock(&global_comp_mutex); assert(ntbytes <= (int32_t)destsize); return ntbytes; } +/* The public routine for decompression with context. 
*/ +int blosc_decompress_ctx(const void *src, void *dest, size_t destsize, + int numinternalthreads) +{ + int result; + struct blosc_context context; + + context.threads_started = 0; + result = blosc_run_decompression_with_context(&context, src, dest, destsize, numinternalthreads); + + if (numinternalthreads > 1) + { + blosc_release_threadpool(&context); + } + + return result; +} + + +/* The public routine for decompression. See blosc.h for docstrings. */ +int blosc_decompress(const void *src, void *dest, size_t destsize) +{ + int result; + + pthread_mutex_lock(&global_comp_mutex); + + result = blosc_run_decompression_with_context(g_global_context, src, dest, + destsize, g_threads); + + pthread_mutex_unlock(&global_comp_mutex); + + return result; +} + /* Specific routine optimized for decompression a small number of items out of a compressed chunk. This does not use threads because @@ -1221,19 +1281,18 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest) int32_t nblocks; /* number of total blocks in buffer */ int32_t leftover; /* extra bytes at end of buffer */ uint8_t *bstarts; /* start pointers for each block */ - uint8_t *tmp = params.tmp[0]; /* tmp for thread 0 */ - uint8_t *tmp2 = params.tmp2[0]; /* tmp2 for thread 0 */ int tmp_init = 0; int32_t typesize, blocksize, nbytes, ctbytes; int32_t j, bsize, bsize2, leftoverblock; int32_t cbytes, startb, stopb; int stop = start + nitems; + uint8_t *tmp; + uint8_t *tmp2; + uint8_t *tmp3; + int32_t ebsize; _src = (uint8_t *)(src); - /* Take global lock */ - pthread_mutex_lock(&global_comp_mutex); - /* Read the header block */ version = _src[0]; /* blosc format version */ versionlz = _src[1]; /* blosclz format version */ @@ -1243,6 +1302,11 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest) blocksize = sw32_(_src + 8); /* block size */ ctbytes = sw32_(_src + 12); /* compressed buffer size */ + ebsize = blocksize + typesize * (int32_t)sizeof(int32_t); + tmp = my_malloc(blocksize + 
ebsize + blocksize); + tmp2 = tmp + blocksize; + tmp3 = tmp + blocksize + ebsize; + version += 0; /* shut up compiler warning */ versionlz += 0; /* shut up compiler warning */ ctbytes += 0; /* shut up compiler warning */ @@ -1259,29 +1323,12 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest) /* Check region boundaries */ if ((start < 0) || (start*typesize > nbytes)) { fprintf(stderr, "`start` out of bounds"); - BLOSC_UNLOCK_RETURN(-1); + return -1; } if ((stop < 0) || (stop*typesize > nbytes)) { fprintf(stderr, "`start`+`nitems` out of bounds"); - BLOSC_UNLOCK_RETURN(-1); - } - - /* Parameters needed by blosc_d */ - params.typesize = typesize; - params.flags = flags; - - /* Initialize temporaries if needed */ - if (tmp == NULL || tmp2 == NULL || current_temp.blocksize < blocksize) { - tmp = my_malloc(blocksize); - if (tmp == NULL) { - BLOSC_UNLOCK_RETURN(-1); - } - tmp2 = my_malloc(blocksize); - if (tmp2 == NULL) { - BLOSC_UNLOCK_RETURN(-1); - } - tmp_init = 1; + return -1; } for (j = 0; j < nblocks; j++) { @@ -1315,10 +1362,15 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest) cbytes = bsize2; } else { + struct blosc_context context; + /* blosc_d only uses typesize and flags */ + context.typesize = typesize; + context.header_flags = &flags; + /* Regular decompression. Put results in tmp2. 
*/ - cbytes = blosc_d(bsize, leftoverblock, + cbytes = blosc_d(&context, bsize, leftoverblock, (uint8_t *)src + sw32_(bstarts + j * 4), - tmp2, tmp, tmp2); + tmp2, tmp, tmp3); if (cbytes < 0) { ntbytes = cbytes; break; @@ -1330,22 +1382,16 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest) ntbytes += cbytes; } - /* Release global lock */ - pthread_mutex_unlock(&global_comp_mutex); - - if (tmp_init) { - my_free(tmp); - my_free(tmp2); - } + my_free(tmp); return ntbytes; } /* Decompress & unshuffle several blocks in a single thread */ -static void *t_blosc(void *tids) +static void *t_blosc(void *ctxt) { - int32_t tid = *(int32_t *)tids; + struct thread_context* context = (struct thread_context*)ctxt; int32_t cbytes, ntdest; int32_t tblocks; /* number of blocks per thread */ int32_t leftover2; @@ -1362,54 +1408,55 @@ static void *t_blosc(void *tids) int32_t nblocks; int32_t leftover; uint8_t *bstarts; - uint8_t *src; + const uint8_t *src; uint8_t *dest; uint8_t *tmp; uint8_t *tmp2; + uint8_t *tmp3; + int rc; - while (1) { - - init_sentinels_done = 0; /* sentinels have to be initialised yet */ - + while(1) + { /* Synchronization point for all threads (wait for initialization) */ - WAIT_INIT(NULL); + WAIT_INIT(NULL, context->parent_context); - /* Check if thread has been asked to return */ - if (end_threads) { - return(NULL); + if(context->parent_context->end_threads) + { + break; } - pthread_mutex_lock(&count_mutex); - if (!init_sentinels_done) { - /* Set sentinels and other global variables */ - giveup_code = 1; /* no error code initially */ - nblock = -1; /* block counter */ - init_sentinels_done = 1; /* sentinels have been initialised */ + /* Get parameters for this thread before entering the main loop */ + blocksize = context->parent_context->blocksize; + ebsize = blocksize + context->parent_context->typesize * (int32_t)sizeof(int32_t); + compress = context->parent_context->compress; + flags = *(context->parent_context->header_flags); + 
maxbytes = context->parent_context->destsize; + nblocks = context->parent_context->nblocks; + leftover = context->parent_context->leftover; + bstarts = context->parent_context->bstarts; + src = context->parent_context->src; + dest = context->parent_context->dest; + + if (blocksize > context->tmpblocksize) + { + my_free(context->tmp); + context->tmp = my_malloc(blocksize + ebsize + blocksize); + context->tmp2 = context->tmp + blocksize; + context->tmp3 = context->tmp + blocksize + ebsize; } - pthread_mutex_unlock(&count_mutex); - /* Get parameters for this thread before entering the main loop */ - blocksize = params.blocksize; - ebsize = blocksize + params.typesize * (int32_t)sizeof(int32_t); - compress = params.compress; - flags = params.flags; - maxbytes = params.maxbytes; - nblocks = params.nblocks; - leftover = params.leftover; - bstarts = params.bstarts; - src = params.src; - dest = params.dest; - tmp = params.tmp[tid]; - tmp2 = params.tmp2[tid]; + tmp = context->tmp; + tmp2 = context->tmp2; + tmp3 = context->tmp3; ntbytes = 0; /* only useful for decompression */ if (compress && !(flags & BLOSC_MEMCPYED)) { /* Compression always has to follow the block order */ - pthread_mutex_lock(&count_mutex); - nblock++; - nblock_ = nblock; - pthread_mutex_unlock(&count_mutex); + pthread_mutex_lock(&context->parent_context->count_mutex); + context->parent_context->thread_nblock++; + nblock_ = context->parent_context->thread_nblock; + pthread_mutex_unlock(&context->parent_context->count_mutex); tblock = nblocks; } else { @@ -1417,11 +1464,11 @@ static void *t_blosc(void *tids) sequential block order on each thread */ /* Blocks per thread */ - tblocks = nblocks / nthreads; - leftover2 = nblocks % nthreads; + tblocks = nblocks / context->parent_context->numthreads; + leftover2 = nblocks % context->parent_context->numthreads; tblocks = (leftover2>0)? 
tblocks+1: tblocks; - nblock_ = tid*tblocks; + nblock_ = context->tid*tblocks; tblock = nblock_ + tblocks; if (tblock > nblocks) { tblock = nblocks; @@ -1430,7 +1477,7 @@ static void *t_blosc(void *tids) /* Loop over blocks */ leftoverblock = 0; - while ((nblock_ < tblock) && giveup_code > 0) { + while ((nblock_ < tblock) && context->parent_context->thread_giveup_code > 0) { bsize = blocksize; if (nblock_ == (nblocks - 1) && (leftover > 0)) { bsize = leftover; @@ -1445,8 +1492,8 @@ static void *t_blosc(void *tids) } else { /* Regular compression */ - cbytes = blosc_c(bsize, leftoverblock, 0, ebsize, - src+nblock_*blocksize, tmp2, tmp); + cbytes = blosc_c(context->parent_context, bsize, leftoverblock, 0, ebsize, + src+nblock_*blocksize, tmp2, tmp, tmp3); } } else { @@ -1457,7 +1504,7 @@ static void *t_blosc(void *tids) cbytes = bsize; } else { - cbytes = blosc_d(bsize, leftoverblock, + cbytes = blosc_d(context->parent_context, bsize, leftoverblock, src + sw32_(bstarts + nblock_ * 4), dest+nblock_*blocksize, tmp, tmp2); @@ -1465,33 +1512,33 @@ static void *t_blosc(void *tids) } /* Check whether current thread has to giveup */ - if (giveup_code <= 0) { + if (context->parent_context->thread_giveup_code <= 0) { break; } /* Check results for the compressed/decompressed block */ if (cbytes < 0) { /* compr/decompr failure */ /* Set giveup_code error */ - pthread_mutex_lock(&count_mutex); - giveup_code = cbytes; - pthread_mutex_unlock(&count_mutex); + pthread_mutex_lock(&context->parent_context->count_mutex); + context->parent_context->thread_giveup_code = cbytes; + pthread_mutex_unlock(&context->parent_context->count_mutex); break; } if (compress && !(flags & BLOSC_MEMCPYED)) { /* Start critical section */ - pthread_mutex_lock(&count_mutex); - ntdest = params.ntbytes; + pthread_mutex_lock(&context->parent_context->count_mutex); + ntdest = context->parent_context->num_output_bytes; _sw32(bstarts + nblock_ * 4, ntdest); /* update block start counter */ if ( (cbytes == 0) || 
(ntdest+cbytes > maxbytes) ) { - giveup_code = 0; /* uncompressible buffer */ - pthread_mutex_unlock(&count_mutex); + context->parent_context->thread_giveup_code = 0; /* uncompressible buffer */ + pthread_mutex_unlock(&context->parent_context->count_mutex); break; } - nblock++; - nblock_ = nblock; - params.ntbytes += cbytes; /* update return bytes counter */ - pthread_mutex_unlock(&count_mutex); + context->parent_context->thread_nblock++; + nblock_ = context->parent_context->thread_nblock; + context->parent_context->num_output_bytes += cbytes; /* update return bytes counter */ + pthread_mutex_unlock(&context->parent_context->count_mutex); /* End of critical section */ /* Copy the compressed buffer to destination */ @@ -1506,54 +1553,74 @@ static void *t_blosc(void *tids) } /* closes while (nblock_) */ /* Sum up all the bytes decompressed */ - if ((!compress || (flags & BLOSC_MEMCPYED)) && giveup_code > 0) { + if ((!compress || (flags & BLOSC_MEMCPYED)) && context->parent_context->thread_giveup_code > 0) { /* Update global counter for all threads (decompression only) */ - pthread_mutex_lock(&count_mutex); - params.ntbytes += ntbytes; - pthread_mutex_unlock(&count_mutex); + pthread_mutex_lock(&context->parent_context->count_mutex); + context->parent_context->num_output_bytes += ntbytes; + pthread_mutex_unlock(&context->parent_context->count_mutex); } /* Meeting point for all threads (wait for finalization) */ - WAIT_FINISH(NULL); + WAIT_FINISH(NULL, context->parent_context); + } - } /* closes while(1) */ + /* Cleanup our working space and context */ + my_free(context->tmp); + my_free(context); - /* This should never be reached, but anyway */ return(NULL); } -static int init_threads(void) +static int init_threads(struct blosc_context* context) { int32_t tid; int rc2; + int32_t ebsize; + struct thread_context* thread_context; /* Initialize mutex and condition variable objects */ - pthread_mutex_init(&count_mutex, NULL); + pthread_mutex_init(&context->count_mutex, 
NULL); + + /* Set context thread sentinels */ + context->thread_giveup_code = 1; + context->thread_nblock = -1; /* Barrier initialization */ #ifdef _POSIX_BARRIERS_MINE - pthread_barrier_init(&barr_init, NULL, nthreads+1); - pthread_barrier_init(&barr_finish, NULL, nthreads+1); + pthread_barrier_init(&context->barr_init, NULL, context->numthreads+1); + pthread_barrier_init(&context->barr_finish, NULL, context->numthreads+1); #else - pthread_mutex_init(&count_threads_mutex, NULL); - pthread_cond_init(&count_threads_cv, NULL); - count_threads = 0; /* Reset threads counter */ + pthread_mutex_init(&context->count_threads_mutex, NULL); + pthread_cond_init(&context->count_threads_cv, NULL); + context->count_threads = 0; /* Reset threads counter */ #endif #if !defined(_WIN32) /* Initialize and set thread detached attribute */ - pthread_attr_init(&ct_attr); - pthread_attr_setdetachstate(&ct_attr, PTHREAD_CREATE_JOINABLE); + pthread_attr_init(&context->ct_attr); + pthread_attr_setdetachstate(&context->ct_attr, PTHREAD_CREATE_JOINABLE); #endif /* Finally, create the threads in detached state */ - for (tid = 0; tid < nthreads; tid++) { - tids[tid] = tid; + for (tid = 0; tid < context->numthreads; tid++) { + context->tids[tid] = tid; + + /* Create a thread context thread owns context (will destroy when finished) */ + thread_context = (struct thread_context*)my_malloc(sizeof(struct thread_context)); + thread_context->parent_context = context; + thread_context->tid = tid; + + ebsize = context->blocksize + context->typesize * (int32_t)sizeof(int32_t); + thread_context->tmp = my_malloc(context->blocksize + ebsize + context->blocksize); + thread_context->tmp2 = thread_context->tmp + context->blocksize; + thread_context->tmp3 = thread_context->tmp + context->blocksize + ebsize; + thread_context->tmpblocksize = context->blocksize; + #if !defined(_WIN32) - rc2 = pthread_create(&threads[tid], &ct_attr, t_blosc, (void *)&tids[tid]); + rc2 = pthread_create(&context->threads[tid], 
&context->ct_attr, t_blosc, (void *)thread_context); #else - rc2 = pthread_create(&threads[tid], NULL, t_blosc, (void *)&tids[tid]); + rc2 = pthread_create(&context->threads[tid], NULL, t_blosc, (void *)thread_context); #endif if (rc2) { fprintf(stderr, "ERROR; return code from pthread_create() is %d\n", rc2); @@ -1562,100 +1629,58 @@ static int init_threads(void) } } - init_threads_done = 1; /* Initialization done! */ - pid = (int)getpid(); /* save the PID for this process */ return(0); } -void blosc_init(void) { - /* Init global lock */ - pthread_mutex_init(&global_comp_mutex, NULL); - init_lib = 1; -} - int blosc_set_nthreads(int nthreads_new) { - int ret; + int ret = g_threads; - /* Check if should initialize (implementing previous 1.2.3 behaviour, - where calling blosc_set_nthreads was enough) */ - if (!init_lib) blosc_init(); - - /* Take global lock */ - pthread_mutex_lock(&global_comp_mutex); - - ret = blosc_set_nthreads_(nthreads_new); - /* Release global lock */ - pthread_mutex_unlock(&global_comp_mutex); + if (nthreads_new != ret){ + /* Re-initialize Blosc */ + blosc_destroy(); + blosc_init(); + g_threads = nthreads_new; + } return ret; } -int blosc_set_nthreads_(int nthreads_new) +int blosc_set_nthreads_(struct blosc_context* context) { - int32_t nthreads_old = nthreads; - int32_t t; - int rc2; - void *status; - - if (nthreads_new > BLOSC_MAX_THREADS) { + if (context->numthreads > BLOSC_MAX_THREADS) { fprintf(stderr, "Error. nthreads cannot be larger than BLOSC_MAX_THREADS (%d)", BLOSC_MAX_THREADS); return -1; } - else if (nthreads_new <= 0) { + else if (context->numthreads <= 0) { fprintf(stderr, "Error. nthreads must be a positive integer"); return -1; } - /* Only join threads if they are not initialized or if our PID is - different from that in pid var (probably means that we are a - subprocess, and thus threads are non-existent). 
*/ - if (nthreads > 1 && init_threads_done && pid == getpid()) { - /* Tell all existing threads to finish */ - end_threads = 1; - /* Synchronization point for all threads (wait for initialization) */ - WAIT_INIT(-1); - /* Join exiting threads */ - for (t=0; t 1 && (!init_threads_done || pid != getpid())) { - init_threads(); + /* Launch a new pool of threads */ + if (context->numthreads > 1 && context->numthreads != context->threads_started) { + blosc_release_threadpool(context); + init_threads(context); } - return nthreads_old; + /* We have now started the threads */ + context->threads_started = context->numthreads; + + return context->numthreads; } int blosc_set_compressor(const char *compname) { - int code; + int code = blosc_compname_to_compcode(compname); - /* Check if should initialize */ - if (!init_lib) blosc_init(); + g_compressor = code; - code = blosc_compname_to_compcode(compname); - - /* Take global lock */ - pthread_mutex_lock(&global_comp_mutex); - - compressor = code; - - /* Release global lock */ - pthread_mutex_unlock(&global_comp_mutex); + /* Check if should initialize (implementing previous 1.2.3 behaviour, + where calling blosc_set_nthreads was enough) */ + if (!g_initlib) blosc_init(); return code; } @@ -1682,12 +1707,22 @@ char* blosc_list_compressors(void) return ret; } +char* blosc_get_version_string(void) +{ + static char ret[256]; + strcpy(ret, BLOSC_VERSION_STRING); + return ret; +} + int blosc_get_complib_info(char *compname, char **complib, char **version) { int clibcode; char *clibname; char *clibversion = "unknown"; + + #if (defined(HAVE_LZ4) && defined(LZ4_VERSION_MAJOR)) || (defined(HAVE_SNAPPY) && defined(SNAPPY_VERSION)) char sbuffer[256]; + #endif clibcode = compname_to_clibcode(compname); clibname = clibcode_to_clibname(clibcode); @@ -1724,70 +1759,6 @@ int blosc_get_complib_info(char *compname, char **complib, char **version) return clibcode; } -/* Free possible memory temporaries and thread resources */ -int 
blosc_free_resources(void) -{ - int32_t t; - int rc2; - void *status; - - /* Take global lock */ - pthread_mutex_lock(&global_comp_mutex); - - /* Release temporaries */ - if (init_temps_done) { - release_temporaries(); - } - - /* Finish the possible thread pool */ - if (nthreads > 1 && init_threads_done) { - /* Tell all existing threads to finish */ - end_threads = 1; - /* Synchronization point for all threads (wait for initialization) */ - WAIT_INIT(-1); - /* Join exiting threads */ - for (t=0; tthreads_started = 0; + g_initlib = 1; +} - /* Release global lock */ - pthread_mutex_unlock(&global_comp_mutex); +void blosc_destroy(void) +{ + g_initlib = 0; + blosc_release_threadpool(g_global_context); + my_free(g_global_context); + pthread_mutex_destroy(&global_comp_mutex); +} + +int blosc_release_threadpool(struct blosc_context* context) +{ + int32_t t; + void* status; + int rc; + int rc2; + + if (context->threads_started > 0) + { + /* Tell all existing threads to finish */ + context->end_threads = 1; + + /* Sync threads */ + WAIT_INIT(-1, context); + + /* Join exiting threads */ + for (t=0; tthreads_started; t++) { + rc2 = pthread_join(context->threads[t], &status); + if (rc2) { + fprintf(stderr, "ERROR; return code from pthread_join() is %d\n", rc2); + fprintf(stderr, "\tError detail: %s\n", strerror(rc2)); + } + } + + /* Release mutex and condition variable objects */ + pthread_mutex_destroy(&context->count_mutex); + + /* Barriers */ + #ifdef _POSIX_BARRIERS_MINE + pthread_barrier_destroy(&context->barr_init); + pthread_barrier_destroy(&context->barr_finish); + #else + pthread_mutex_destroy(&context->count_threads_mutex); + pthread_cond_destroy(&context->count_threads_cv); + #endif + + /* Thread attributes */ + #if !defined(_WIN32) + pthread_attr_destroy(&context->ct_attr); + #endif + + } + + context->threads_started = 0; + + return 0; +} + +int blosc_free_resources(void) +{ + return blosc_release_threadpool(g_global_context); } diff --git a/c-blosc/blosc/blosc.h 
b/c-blosc/blosc/blosc.h index ca611f4f0..7dd1db99b 100644 --- a/c-blosc/blosc/blosc.h +++ b/c-blosc/blosc/blosc.h @@ -1,30 +1,31 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library - Author: Francesc Alted + Author: Francesc Alted See LICENSES/BLOSC.txt for details about copyright and rights to use. **********************************************************************/ +#ifndef BLOSC_H +#define BLOSC_H #include +#include +#include "blosc-export.h" + #ifdef __cplusplus extern "C" { #endif - -#ifndef BLOSC_H -#define BLOSC_H - /* Version numbers */ #define BLOSC_VERSION_MAJOR 1 /* for major interface/format changes */ -#define BLOSC_VERSION_MINOR 4 /* for minor interface/format changes */ -#define BLOSC_VERSION_RELEASE 4 /* for tweaks, bug-fixes, or development */ +#define BLOSC_VERSION_MINOR 8 /* for minor interface/format changes */ +#define BLOSC_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ -#define BLOSC_VERSION_STRING "1.4.4" /* string version. Sync with above! */ +#define BLOSC_VERSION_STRING "1.8.1" /* string version. Sync with above! 
*/ #define BLOSC_VERSION_REVISION "$Rev$" /* revision version */ -#define BLOSC_VERSION_DATE "$Date:: 2015-05-05 #$" /* date version */ +#define BLOSC_VERSION_DATE "$Date:: 2016-04-08 #$" /* date version */ -#define BLOSCLZ_VERSION_STRING "1.0.2.1" /* the internal compressor version */ +#define BLOSCLZ_VERSION_STRING "1.0.5" /* the internal compressor version */ /* The *_FORMAT symbols should be just 1-byte long */ #define BLOSC_VERSION_FORMAT 2 /* Blosc format version, starting at 1 */ @@ -37,18 +38,24 @@ extern "C" { implementations */ #define BLOSC_MAX_OVERHEAD BLOSC_MIN_HEADER_LENGTH -/* Maximum buffer size to be compressed */ +/* Maximum source buffer size to be compressed */ #define BLOSC_MAX_BUFFERSIZE (INT_MAX - BLOSC_MAX_OVERHEAD) -/* Maximum typesize before considering buffer as a stream of bytes */ +/* Maximum typesize before considering source buffer as a stream of bytes */ #define BLOSC_MAX_TYPESIZE 255 /* Cannot be larger than 255 */ /* The maximum number of threads (for some static arrays) */ #define BLOSC_MAX_THREADS 256 +/* Codes for shuffling (see blosc_compress) */ +#define BLOSC_NOSHUFFLE 0 /* no shuffle */ +#define BLOSC_SHUFFLE 1 /* byte-wise shuffle */ +#define BLOSC_BITSHUFFLE 2 /* bit-wise shuffle */ + /* Codes for internal flags (see blosc_cbuffer_metainfo) */ -#define BLOSC_DOSHUFFLE 0x1 -#define BLOSC_MEMCPYED 0x2 +#define BLOSC_DOSHUFFLE 0x1 /* byte-wise shuffle */ +#define BLOSC_MEMCPYED 0x2 /* plain copy */ +#define BLOSC_DOBITSHUFFLE 0x4 /* bit-wise shuffle */ /* Codes for the different compressors shipped with Blosc */ #define BLOSC_BLOSCLZ 0 @@ -64,7 +71,7 @@ extern "C" { #define BLOSC_SNAPPY_COMPNAME "snappy" #define BLOSC_ZLIB_COMPNAME "zlib" -/* Codes for the different compression libraries shipped with Blosc */ +/* Codes for compression libraries shipped with Blosc (code must be < 8) */ #define BLOSC_BLOSCLZ_LIB 0 #define BLOSC_LZ4_LIB 1 #define BLOSC_SNAPPY_LIB 2 @@ -76,7 +83,7 @@ extern "C" { #define BLOSC_SNAPPY_LIBNAME 
"Snappy" #define BLOSC_ZLIB_LIBNAME "Zlib" -/* The codes for compressor formats shipped with Blosc (code must be < 8) */ +/* The codes for compressor formats shipped with Blosc */ #define BLOSC_BLOSCLZ_FORMAT BLOSC_BLOSCLZ_LIB #define BLOSC_LZ4_FORMAT BLOSC_LZ4_LIB /* LZ4HC and LZ4 share the same format */ @@ -95,20 +102,24 @@ extern "C" { /** - Initialize the Blosc library. You must call this previous to any - other Blosc call, and make sure that you call this in a non-threaded - environment. Other Blosc calls can be called in a threaded - environment, if desired. + Initialize the Blosc library environment. + + You must call this previous to any other Blosc call, unless you want + Blosc to be used simultaneously in a multi-threaded environment, in + which case you should *exclusively* use the + blosc_compress_ctx()/blosc_decompress_ctx() pair (see below). */ -void blosc_init(void); +BLOSC_EXPORT void blosc_init(void); /** - Destroy the Blosc library environment. You must call this after to - you are done with all the Blosc calls, and make sure that you call - this in a non-threaded environment. + Destroy the Blosc library environment. + + You must call this after to you are done with all the Blosc calls, + unless you have not used blosc_init() before (see blosc_init() + above). */ -void blosc_destroy(void); +BLOSC_EXPORT void blosc_destroy(void); /** @@ -120,8 +131,9 @@ void blosc_destroy(void); between 0 (no compression) and 9 (maximum compression). `doshuffle` specifies whether the shuffle compression preconditioner - should be applied or not. 0 means not applying it and 1 means - applying it. + should be applied or not. BLOSC_NOSHUFFLE means not applying it, + BLOSC_SHUFFLE means applying it at a byte level and BLOSC_BITSHUFFLE + at a bit level (slower but may achieve better entropy alignment). `typesize` is the number of bytes for the atomic type in binary `src` buffer. This is mainly useful for the shuffle preconditioner. 
@@ -145,10 +157,35 @@ void blosc_destroy(void); should never happen. If you see this, please report it back together with the buffer data causing this and compression settings. */ -int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes, - const void *src, void *dest, size_t destsize); +BLOSC_EXPORT int blosc_compress(int clevel, int doshuffle, size_t typesize, + size_t nbytes, const void *src, void *dest, + size_t destsize); +/** + Context interface to blosc compression. This does not require a call + to blosc_init() and can be called from multithreaded applications + without the global lock being used, so allowing Blosc be executed + simultaneously in those scenarios. + + It uses the same parameters than the blosc_compress() function plus: + + `compressor`: the string representing the type of compressor to use. + + `blocksize`: the requested size of the compressed blocks. If 0, an + automatic blocksize will be used. + + `numinternalthreads`: the number of threads to use internally. + + A negative return value means that an internal error happened. This + should never happen. If you see this, please report it back + together with the buffer data causing this and compression settings. +*/ +BLOSC_EXPORT int blosc_compress_ctx(int clevel, int doshuffle, size_t typesize, + size_t nbytes, const void* src, void* dest, + size_t destsize, const char* compressor, + size_t blocksize, int numinternalthreads); + /** Decompress a block of compressed data in `src`, put the result in `dest` and returns the size of the decompressed block. @@ -161,10 +198,30 @@ int blosc_compress(int clevel, int doshuffle, size_t typesize, size_t nbytes, If an error occurs, e.g. the compressed data is corrupted or the output buffer is not large enough, then 0 (zero) or a negative value will be returned instead. 
- */ -int blosc_decompress(const void *src, void *dest, size_t destsize); +*/ +BLOSC_EXPORT int blosc_decompress(const void *src, void *dest, size_t destsize); +/** + Context interface to blosc decompression. This does not require a + call to blosc_init() and can be called from multithreaded + applications without the global lock being used, so allowing Blosc + be executed simultaneously in those scenarios. + + It uses the same parameters than the blosc_decompress() function plus: + + `numinternalthreads`: number of threads to use internally. + + Decompression is memory safe and guaranteed not to write the `dest` + buffer more than what is specified in `destsize`. + + If an error occurs, e.g. the compressed data is corrupted or the + output buffer is not large enough, then 0 (zero) or a negative value + will be returned instead. +*/ +BLOSC_EXPORT int blosc_decompress_ctx(const void *src, void *dest, + size_t destsize, int numinternalthreads); + /** Get `nitems` (of typesize size) in `src` buffer starting in `start`. The items are returned in `dest` buffer, which has to have enough @@ -173,7 +230,7 @@ int blosc_decompress(const void *src, void *dest, size_t destsize); Returns the number of bytes copied to `dest` or a negative value if some error happens. */ -int blosc_getitem(const void *src, int start, int nitems, void *dest); +BLOSC_EXPORT int blosc_getitem(const void *src, int start, int nitems, void *dest); /** @@ -184,7 +241,7 @@ int blosc_getitem(const void *src, int start, int nitems, void *dest); Returns the previous number of threads. */ -int blosc_set_nthreads(int nthreads); +BLOSC_EXPORT int blosc_set_nthreads(int nthreads); /** @@ -196,7 +253,7 @@ int blosc_set_nthreads(int nthreads); for it in this build, it returns a -1. Else it returns the code for the compressor (>=0). 
*/ -int blosc_set_compressor(const char* compname); +BLOSC_EXPORT int blosc_set_compressor(const char* compname); /** @@ -206,7 +263,7 @@ int blosc_set_compressor(const char* compname); for it in this build, -1 is returned. Else, the compressor code is returned. */ -int blosc_compcode_to_compname(int compcode, char **compname); +BLOSC_EXPORT int blosc_compcode_to_compname(int compcode, char **compname); /** @@ -215,7 +272,7 @@ int blosc_compcode_to_compname(int compcode, char **compname); If the compressor name is not recognized, or there is not support for it in this build, -1 is returned instead. */ -int blosc_compname_to_compcode(const char *compname); +BLOSC_EXPORT int blosc_compname_to_compcode(const char *compname); /** @@ -229,7 +286,14 @@ int blosc_compname_to_compcode(const char *compname); This function should always succeed. */ -char* blosc_list_compressors(void); +BLOSC_EXPORT char* blosc_list_compressors(void); + +/** + Return the version of blosc in string format. + + Useful for dynamic libraries. +*/ +BLOSC_EXPORT char* blosc_get_version_string(void); /** @@ -246,7 +310,7 @@ char* blosc_list_compressors(void); If the compressor is supported, it returns the code for the library (>=0). If it is not supported, this function returns -1. */ -int blosc_get_complib_info(char *compname, char **complib, char **version); +BLOSC_EXPORT int blosc_get_complib_info(char *compname, char **complib, char **version); /** @@ -255,7 +319,7 @@ int blosc_get_complib_info(char *compname, char **complib, char **version); problems releasing the resources, it returns a negative number, else it returns 0. */ -int blosc_free_resources(void); +BLOSC_EXPORT int blosc_free_resources(void); /** @@ -269,8 +333,8 @@ int blosc_free_resources(void); This function should always succeed. 
*/ -void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes, - size_t *cbytes, size_t *blocksize); +BLOSC_EXPORT void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes, + size_t *cbytes, size_t *blocksize); /** @@ -281,14 +345,15 @@ void blosc_cbuffer_sizes(const void *cbuffer, size_t *nbytes, * bit 0: whether the shuffle filter has been applied or not * bit 1: whether the internal buffer is a pure memcpy or not - You can use the `BLOSC_DOSHUFFLE` and `BLOSC_MEMCPYED` symbols for - extracting the interesting bits (e.g. ``flags & BLOSC_DOSHUFFLE`` - says whether the buffer is shuffled or not). + You can use the `BLOSC_DOSHUFFLE`, `BLOSC_DOBITSHUFFLE` and + `BLOSC_MEMCPYED` symbols for extracting the interesting bits + (e.g. ``flags & BLOSC_DOSHUFFLE`` says whether the buffer is + byte-shuffled or not). This function should always succeed. */ -void blosc_cbuffer_metainfo(const void *cbuffer, size_t *typesize, - int *flags); +BLOSC_EXPORT void blosc_cbuffer_metainfo(const void *cbuffer, size_t *typesize, + int *flags); /** @@ -298,8 +363,8 @@ void blosc_cbuffer_metainfo(const void *cbuffer, size_t *typesize, This function should always succeed. */ -void blosc_cbuffer_versions(const void *cbuffer, int *version, - int *versionlz); +BLOSC_EXPORT void blosc_cbuffer_versions(const void *cbuffer, int *version, + int *versionlz); /** @@ -307,7 +372,7 @@ void blosc_cbuffer_versions(const void *cbuffer, int *version, This function should always succeed. */ -char *blosc_cbuffer_complib(const void *cbuffer); +BLOSC_EXPORT char *blosc_cbuffer_complib(const void *cbuffer); @@ -322,7 +387,7 @@ char *blosc_cbuffer_complib(const void *cbuffer); Force the use of a specific blocksize. If 0, an automatic blocksize will be used (the default). 
*/ -void blosc_set_blocksize(size_t blocksize); +BLOSC_EXPORT void blosc_set_blocksize(size_t blocksize); #ifdef __cplusplus } diff --git a/c-blosc/blosc/blosclz.c b/c-blosc/blosc/blosclz.c index 4106eb29a..d5be92b63 100644 --- a/c-blosc/blosc/blosclz.c +++ b/c-blosc/blosc/blosclz.c @@ -1,7 +1,7 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library - Author: Francesc Alted + Author: Francesc Alted Creation date: 2009-05-20 See LICENSES/BLOSC.txt for details about copyright and rights to use. @@ -20,7 +20,17 @@ #if defined(_WIN32) && !defined(__MINGW32__) #include - #include "win32/stdint-windows.h" + + /* stdint.h only available in VS2010 (VC++ 16.0) and newer */ + #if defined(_MSC_VER) && _MSC_VER < 1600 + #include "win32/stdint-windows.h" + #else + #include + #endif + /* llabs only available in VS2013 (VC++ 18.0) and newer */ + #if defined(_MSC_VER) && _MSC_VER < 1800 + #define llabs(v) abs(v) + #endif #else #include #endif /* _WIN32 */ @@ -35,7 +45,7 @@ #undef BLOSCLZ_STRICT_ALIGN #elif defined(__i486__) || defined(__i586__) || defined(__i686__) /* GNU C */ #undef BLOSCLZ_STRICT_ALIGN -#elif defined(_M_IX86) /* Intel, MSVC */ +#elif defined(_M_IX86) || defined(_M_X64) /* Intel, MSVC */ #undef BLOSCLZ_STRICT_ALIGN #elif defined(__386) #undef BLOSCLZ_STRICT_ALIGN @@ -44,8 +54,8 @@ #elif defined(__I86__) /* Digital Mars */ #undef BLOSCLZ_STRICT_ALIGN /* Seems like unaligned access in ARM (at least ARMv6) is pretty - expensive, so we are always to enfor strict aligment in ARM. If - anybody suggest that newer ARMs are better, we can revisit this. */ + expensive, so we are going to always enforce strict aligment in ARM. + If anybody suggest that newer ARMs are better, we can revisit this. 
*/ /* #elif defined(__ARM_FEATURE_UNALIGNED) */ /* ARM, GNU C */ /* #undef BLOSCLZ_STRICT_ALIGN */ #endif @@ -76,7 +86,6 @@ #endif #define MAX_COPY 32 -#define MAX_LEN 264 /* 256 + 8 */ #define MAX_DISTANCE 8191 #define MAX_FARDISTANCE (65535+MAX_DISTANCE-1) @@ -87,21 +96,85 @@ #endif -static inline int32_t hash_function(uint8_t* p, uint8_t hash_log) -{ - int32_t v; +/* + * Fast copy macros + */ +#if defined(_WIN32) + #define CPYSIZE 32 +#else + #define CPYSIZE 8 +#endif +#define MCPY(d,s) { memcpy(d, s, CPYSIZE); d+=CPYSIZE; s+=CPYSIZE; } +#define FASTCOPY(d,s,e) { do { MCPY(d,s) } while (d>(16-hash_log)); - v &= (1 << hash_log) - 1; - return v; +#define SAFE_COPY(op, ref, len, op_limit) \ +if (llabs(op-ref) < CPYSIZE) { \ + for(; len; --len) \ + *op++ = *ref++; \ +} \ +else BLOCK_COPY(op, ref, len, op_limit); + +/* Copy optimized for GCC 4.8. Seems like long copy loops are optimal. */ +#define GCC_SAFE_COPY(op, ref, len, op_limit) \ +if ((len > 32) || (llabs(op-ref) < CPYSIZE)) { \ + for(; len; --len) \ + *op++ = *ref++; \ +} \ +else BLOCK_COPY(op, ref, len, op_limit); + +/* Simple, but pretty effective hash function for 3-byte sequence */ +#define HASH_FUNCTION(v, p, l) { \ + v = BLOSCLZ_READU16(p); \ + v ^= BLOSCLZ_READU16(p + 1) ^ ( v >> (16 - l)); \ + v &= (1 << l) - 1; \ } +/* Another version which seems to be a bit more effective than the above, + * but a bit slower. Could be interesting for high opt_level. 
+ */ +#define MINMATCH 3 +#define HASH_FUNCTION2(v, p, l) { \ + v = BLOSCLZ_READU16(p); \ + v = (v * 2654435761U) >> ((MINMATCH * 8) - (l + 1)); \ + v &= (1 << l) - 1; \ +} + +#define LITERAL(ip, op, op_limit, anchor, copy) { \ + if (BLOSCLZ_UNEXPECT_CONDITIONAL(op+2 > op_limit)) \ + goto out; \ + *op++ = *anchor++; \ + ip = anchor; \ + copy++; \ + if(BLOSCLZ_UNEXPECT_CONDITIONAL(copy == MAX_COPY)) { \ + copy = 0; \ + *op++ = MAX_COPY-1; \ + } \ + continue; \ +} #define IP_BOUNDARY 2 -int blosclz_compress(int opt_level, const void* input, - int length, void* output, int maxout) + +int blosclz_compress(const int opt_level, const void* input, int length, + void* output, int maxout, int accel) { uint8_t* ip = (uint8_t*) input; uint8_t* ibase = (uint8_t*) input; @@ -116,7 +189,7 @@ int blosclz_compress(int opt_level, const void* input, and taking the minimum times on a i5-3380M @ 2.90GHz. Curiously enough, values >= 14 does not always get maximum compression, even with large blocksizes. */ - int8_t hash_log_[10] = {-1, 11, 12, 13, 14, 13, 13, 13, 13, 13}; + int8_t hash_log_[10] = {-1, 11, 11, 11, 12, 13, 13, 13, 13, 13}; uint8_t hash_log = hash_log_[opt_level]; uint16_t hash_size = 1 << hash_log; uint16_t *htab; @@ -125,7 +198,7 @@ int blosclz_compress(int opt_level, const void* input, int32_t hval; uint8_t copy; - double maxlength_[10] = {-1, .1, .15, .2, .5, .7, .85, .925, .975, 1.0}; + double maxlength_[10] = {-1, .1, .15, .2, .3, .45, .6, .75, .9, 1.0}; int32_t maxlength = (int32_t) (length * maxlength_[opt_level]); if (maxlength > (int32_t) maxout) { maxlength = (int32_t) maxout; @@ -133,25 +206,15 @@ int blosclz_compress(int opt_level, const void* input, op_limit = op + maxlength; /* output buffer cannot be less than 66 bytes or we can get into trouble */ - if (maxlength < 66) { - return 0; /* mark this as uncompressible */ + if (BLOSCLZ_UNEXPECT_CONDITIONAL(maxlength < 66 || length < 4)) { + return 0; } - htab = (uint16_t *) calloc(hash_size, sizeof(uint16_t)); 
+ /* prepare the acceleration to be used in condition */ + accel = accel < 1 ? 1 : accel; + accel -= 1; - /* sanity check */ - if(BLOSCLZ_UNEXPECT_CONDITIONAL(length < 4)) { - if(length) { - /* create literal copy only */ - *op++ = length-1; - ip_bound++; - while(ip <= ip_bound) - *op++ = *ip++; - free(htab); - return length+1; - } - else goto out; - } + htab = (uint16_t *) calloc(hash_size, sizeof(uint16_t)); /* we start with literal copy */ copy = 2; @@ -175,23 +238,25 @@ int blosclz_compress(int opt_level, const void* input, } /* find potential match */ - hval = hash_function(ip, hash_log); + HASH_FUNCTION(hval, ip, hash_log); ref = ibase + htab[hval]; - /* update hash table */ - htab[hval] = (uint16_t)(anchor - ibase); /* calculate distance to the match */ distance = (int32_t)(anchor - ref); + /* update hash table if necessary */ + if ((distance & accel) == 0) + htab[hval] = (uint16_t)(anchor - ibase); + /* is this a match? check the first 3 bytes */ if (distance==0 || (distance >= MAX_FARDISTANCE) || *ref++ != *ip++ || *ref++!=*ip++ || *ref++!=*ip++) - goto literal; + LITERAL(ip, op, op_limit, anchor, copy); /* far, needs at least 5-byte match */ - if (distance >= MAX_DISTANCE) { + if (opt_level >= 5 && distance >= MAX_DISTANCE) { if (*ip++ != *ref++ || *ip++ != *ref++) - goto literal; + LITERAL(ip, op, op_limit, anchor, copy); len += 2; } @@ -238,7 +303,6 @@ int blosclz_compress(int opt_level, const void* input, for(;;) { /* safe because the outer check against ip limit */ while (ip < (ip_bound - (sizeof(int64_t) - IP_BOUNDARY))) { - if (*ref++ != *ip++) break; #if !defined(BLOSCLZ_STRICT_ALIGN) if (((int64_t *)ref)[0] != ((int64_t *)ip)[0]) { #endif @@ -315,25 +379,13 @@ int blosclz_compress(int opt_level, const void* input, } /* update the hash at match boundary */ - hval = hash_function(ip, hash_log); + HASH_FUNCTION(hval, ip, hash_log); htab[hval] = (uint16_t)(ip++ - ibase); - hval = hash_function(ip, hash_log); + HASH_FUNCTION(hval, ip, hash_log); 
htab[hval] = (uint16_t)(ip++ - ibase); /* assuming literal copy */ *op++ = MAX_COPY-1; - - continue; - - literal: - if (BLOSCLZ_UNEXPECT_CONDITIONAL(op+2 > op_limit)) goto out; - *op++ = *anchor++; - ip = anchor; - copy++; - if(BLOSCLZ_UNEXPECT_CONDITIONAL(copy == MAX_COPY)) { - copy = 0; - *op++ = MAX_COPY-1; - } } /* left-over as literal copy */ @@ -366,7 +418,6 @@ int blosclz_compress(int opt_level, const void* input, } - int blosclz_decompress(const void* input, int length, void* output, int maxout) { const uint8_t* ip = (const uint8_t*) input; @@ -377,7 +428,7 @@ int blosclz_decompress(const void* input, int length, void* output, int maxout) int32_t loop = 1; do { - const uint8_t* ref = op; + uint8_t* ref = op; int32_t len = ctrl >> 5; int32_t ofs = (ctrl & 31) << 8; @@ -426,21 +477,11 @@ int blosclz_decompress(const void* input, int length, void* output, int maxout) /* copy from reference */ ref--; len += 3; - if (abs((int32_t)(ref-op)) <= (int32_t)len) { - /* src and dst do overlap: do a loop */ - for(; len; --len) - *op++ = *ref++; - /* The memmove below does not work well (don't know why) */ - /* memmove(op, ref, len); - op += len; - ref += len; - len = 0; */ - } - else { - memcpy(op, ref, len); - op += len; - ref += len; - } +#if !defined(_WIN32) && ((defined(__GNUC__) || defined(__INTEL_COMPILER) || !defined(__clang__))) + GCC_SAFE_COPY(op, ref, len, op_limit); +#else + SAFE_COPY(op, ref, len, op_limit); +#endif } } else { @@ -454,9 +495,7 @@ int blosclz_decompress(const void* input, int length, void* output, int maxout) } #endif - memcpy(op, ip, ctrl); - ip += ctrl; - op += ctrl; + BLOCK_COPY(op, ip, ctrl, op_limit); loop = (int32_t)BLOSCLZ_EXPECT_CONDITIONAL(ip < ip_limit); if(loop) diff --git a/c-blosc/blosc/blosclz.h b/c-blosc/blosc/blosclz.h index 509db1943..792a9b64c 100644 --- a/c-blosc/blosc/blosclz.h +++ b/c-blosc/blosc/blosclz.h @@ -1,7 +1,7 @@ /********************************************************************* - Blosc - Blocked Suffling and 
Compression Library + Blosc - Blocked Shuffling and Compression Library - Author: Francesc Alted + Author: Francesc Alted See LICENSES/BLOSC.txt for details about copyright and rights to use. **********************************************************************/ @@ -32,11 +32,16 @@ extern "C" { bytes, the return value will be 0 and you will have to discard the output buffer. + The acceleration parameter is related with the frequency for + updating the internal hash. An acceleration of 1 means that the + internal hash is updated at full rate. A value < 1 is not allowed + and will be silently set to 1. + The input buffer and the output buffer can not overlap. */ -int blosclz_compress(int opt_level, const void* input, int length, - void* output, int maxout); +int blosclz_compress(const int opt_level, const void* input, int length, + void* output, int maxout, int accel); /** Decompress a block of compressed data and returns the size of the diff --git a/c-blosc/blosc/config.h.in b/c-blosc/blosc/config.h.in index 6689769f2..552c5c82d 100644 --- a/c-blosc/blosc/config.h.in +++ b/c-blosc/blosc/config.h.in @@ -4,6 +4,7 @@ #cmakedefine HAVE_LZ4 @HAVE_LZ4@ #cmakedefine HAVE_SNAPPY @HAVE_SNAPPY@ #cmakedefine HAVE_ZLIB @HAVE_ZLIB@ +#cmakedefine BLOSC_DLL_EXPORT @DLL_EXPORT@ #endif diff --git a/c-blosc/blosc/shuffle-avx2.c b/c-blosc/blosc/shuffle-avx2.c new file mode 100644 index 000000000..404d2fd8e --- /dev/null +++ b/c-blosc/blosc/shuffle-avx2.c @@ -0,0 +1,757 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "shuffle-generic.h" +#include "shuffle-avx2.h" + +/* Make sure AVX2 is available for the compilation target and compiler. 
*/ +#if !defined(__AVX2__) + #error AVX2 is not supported by the target architecture/platform and/or this compiler. +#endif + +#include + + +/* The next is useful for debugging purposes */ +#if 0 +#include +#include + +static void printymm(__m256i ymm0) +{ + uint8_t buf[32]; + + ((__m256i *)buf)[0] = ymm0; + printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", + buf[0], buf[1], buf[2], buf[3], + buf[4], buf[5], buf[6], buf[7], + buf[8], buf[9], buf[10], buf[11], + buf[12], buf[13], buf[14], buf[15], + buf[16], buf[17], buf[18], buf[19], + buf[20], buf[21], buf[22], buf[23], + buf[24], buf[25], buf[26], buf[27], + buf[28], buf[29], buf[30], buf[31]); +} +#endif + +/* GCC doesn't include the split load/store intrinsics + needed for the tiled shuffle, so define them here. */ +#if defined(__GNUC__) && !defined(__clang__) +static inline __m256i +__attribute__((__always_inline__)) +_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) +{ + return _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1); +} + +static inline void +__attribute__((__always_inline__)) +_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) +{ + _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a)); + _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1)); +} +#endif /* defined(__GNUC__) */ + +/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ +static void +shuffle2_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 2; + size_t j; + int k; + __m256i ymm0[2], ymm1[2]; + + /* Create the shuffle mask. + NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from + most to least significant (i.e., their order is reversed when compared to + loading the mask from an array). 
*/ + const __m256i shmask = _mm256_set_epi8( + 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, + 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00); + + for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { + /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */ + for (k = 0; k < 2; k++) { + ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); + ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask); + } + + ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8); + ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d); + + ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0); + ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f); + ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e); + + /* Store the result vectors */ + uint8_t* const dest_for_jth_element = dest + j; + for (k = 0; k < 2; k++) { + _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ +static void +shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 4; + size_t i; + int j; + __m256i ymm0[4], ymm1[4]; + + /* Create the shuffle mask. + NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from + most to least significant (i.e., their order is reversed when compared to + loading the mask from an array). */ + const __m256i mask = _mm256_set_epi32( + 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); + + for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { + /* Fetch 32 elements (128 bytes) then transpose bytes and words. 
*/ + for (j = 0; j < 4; j++) { + ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); + ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); + ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); + ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); + ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e); + ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); + } + /* Transpose double words */ + for (j = 0; j < 2; j++) { + ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); + ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); + } + /* Transpose quad words */ + for (j = 0; j < 2; j++) { + ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]); + ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]); + } + for (j = 0; j < 4; j++) { + ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); + } + /* Store the result vectors */ + uint8_t* const dest_for_ith_element = dest + i; + for (j = 0; j < 4; j++) { + _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ +static void +shuffle8_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 8; + size_t j; + int k, l; + __m256i ymm0[8], ymm1[8]; + + for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { + /* Fetch 32 elements (256 bytes) then transpose bytes. 
*/ + for (k = 0; k < 8; k++) { + ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); + ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e); + ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]); + } + /* Transpose words */ + for (k = 0, l = 0; k < 4; k++, l +=2) { + ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]); + ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]); + } + /* Transpose double words */ + for (k = 0, l = 0; k < 4; k++, l++) { + if (k == 2) l += 2; + ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]); + ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]); + } + /* Transpose quad words */ + for (k = 0; k < 4; k++) { + ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]); + ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]); + } + for(k = 0; k < 8; k++) { + ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72); + ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8); + ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]); + } + /* Store the result vectors */ + uint8_t* const dest_for_jth_element = dest + j; + for (k = 0; k < 8; k++) { + _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ +static void +shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 16; + size_t j; + int k, l; + __m256i ymm0[16], ymm1[16]; + + /* Create the shuffle mask. + NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from + most to least significant (i.e., their order is reversed when compared to + loading the mask from an array). 
*/ + const __m256i shmask = _mm256_set_epi8( + 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, + 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, + 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, + 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); + + for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { + /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ + for (k = 0; k < 16; k++) { + ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); + } + /* Transpose bytes */ + for (k = 0, l = 0; k < 8; k++, l +=2) { + ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); + ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); + } + /* Transpose words */ + for (k = 0, l = -2; k < 8; k++, l++) { + if ((k%2) == 0) l += 2; + ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); + ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); + } + /* Transpose double words */ + for (k = 0, l = -4; k < 8; k++, l++) { + if ((k%4) == 0) l += 4; + ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); + ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); + } + /* Transpose quad words */ + for (k = 0; k < 8; k++) { + ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); + ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); + } + for (k = 0; k < 16; k++) { + ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); + ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); + } + /* Store the result vectors */ + uint8_t* const dest_for_jth_element = dest + j; + for (k = 0; k < 16; k++) { + _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. 
*/ +static void +shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) +{ + size_t j; + int k, l; + __m256i ymm0[16], ymm1[16]; + + const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); + + /* Create the shuffle mask. + NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from + most to least significant (i.e., their order is reversed when compared to + loading the mask from an array). */ + const __m256i shmask = _mm256_set_epi8( + 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, + 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, + 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, + 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); + + for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { + /* Advance the offset into the type by the vector size (in bytes), unless this is + the initial iteration and the type size is not a multiple of the vector size. + In that case, only advance by the number of bytes necessary so that the number + of remaining bytes in the type will be a multiple of the vector size. */ + size_t offset_into_type; + for (offset_into_type = 0; offset_into_type < bytesoftype; + offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? 
vecs_per_el.rem : sizeof(__m128i))) { + + /* Fetch elements in groups of 512 bytes */ + const uint8_t* const src_with_offset = src + offset_into_type; + for (k = 0; k < 16; k++) { + ymm0[k] = _mm256_loadu2_m128i( + (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype), + (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype)); + } + /* Transpose bytes */ + for (k = 0, l = 0; k < 8; k++, l +=2) { + ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); + ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); + } + /* Transpose words */ + for (k = 0, l = -2; k < 8; k++, l++) { + if ((k%2) == 0) l += 2; + ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); + ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); + } + /* Transpose double words */ + for (k = 0, l = -4; k < 8; k++, l++) { + if ((k%4) == 0) l += 4; + ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); + ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); + } + /* Transpose quad words */ + for (k = 0; k < 8; k++) { + ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); + ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); + } + for (k = 0; k < 16; k++) { + ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); + ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); + } + /* Store the result vectors */ + uint8_t* const dest_for_jth_element = dest + j; + for (k = 0; k < 16; k++) { + _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]); + } + } + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ +static void +unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 2; + size_t i; + int j; + __m256i ymm0[2], ymm1[2]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { + /* Load 32 elements (64 bytes) into 2 YMM registers. 
*/ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 2; j++) { + ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + for (j = 0; j < 2; j++) { + ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); + } + /* Compute the low 64 bytes */ + ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]); + /* Compute the hi 64 bytes */ + ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]); + /* Store the result vectors in proper order */ + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ +static void +unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 4; + size_t i; + int j; + __m256i ymm0[4], ymm1[4]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { + /* Load 32 elements (128 bytes) into 4 YMM registers. 
*/ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 4; j++) { + ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + for (j = 0; j < 2; j++) { + /* Compute the low 64 bytes */ + ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 64 bytes */ + ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 2; j++) { + /* Compute the low 64 bytes */ + ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); + /* Compute the hi 64 bytes */ + ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); + } + ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20); + ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20); + ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31); + ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31); + + /* Store the result vectors in proper order */ + for (j = 0; j < 4; j++) { + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]); + } + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ +static void +unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 8; + size_t i; + int j; + __m256i ymm0[8], ymm1[8]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { + /* Fetch 32 elements (256 bytes) into 8 YMM registers. 
*/ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 8; j++) { + ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + for (j = 0; j < 4; j++) { + /* Compute the low 32 bytes */ + ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 32 bytes */ + ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); + } + /* Shuffle words */ + for (j = 0; j < 4; j++) { + /* Compute the low 32 bytes */ + ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); + /* Compute the hi 32 bytes */ + ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); + } + for (j = 0; j < 8; j++) { + ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); + } + + /* Shuffle 4-byte dwords */ + for (j = 0; j < 4; j++) { + /* Compute the low 32 bytes */ + ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 32 bytes */ + ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); + } + + /* Store the result vectors in proper order */ + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. 
*/ +static void +unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 16; + size_t i; + int j; + __m256i ymm0[16], ymm1[16]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { + /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 16; j++) { + ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); + } + + /* Shuffle bytes */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 32 bytes */ + ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); + /* Compute the hi 32 bytes */ + ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); + } + /* Shuffle 4-byte dwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 32 bytes */ + ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); + } + + /* Shuffle 8-byte qwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); + /* Compute the hi 32 bytes */ + ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); + } + + for (j = 0; j < 8; j++) { + ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); + ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); + } + + /* Store the result vectors in proper order */ + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * 
sizeof(__m256i))), ymm1[2]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]); + _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ +static void +unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) +{ + size_t i; + int j; + __m256i ymm0[16], ymm1[16]; + + const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); + + /* The unshuffle loops are inverted (compared to shuffle_tiled16_avx2) + to optimize cache utilization. */ + size_t offset_into_type; + for (offset_into_type = 0; offset_into_type < bytesoftype; + offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? 
vecs_per_el.rem : sizeof(__m128i))) { + for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { + /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 16; j++) { + ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); + } + + /* Shuffle bytes */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 32 bytes */ + ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); + /* Compute the hi 32 bytes */ + ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); + } + /* Shuffle 4-byte dwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); + /* Compute the hi 32 bytes */ + ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); + } + + /* Shuffle 8-byte qwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); + /* Compute the hi 32 bytes */ + ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); + } + + for (j = 0; j < 8; j++) { + ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); + ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); + } + + /* Store the result vectors in proper order */ + const uint8_t* const dest_with_offset = dest + offset_into_type; + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]); + _mm256_storeu2_m128i( + 
(__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]); + _mm256_storeu2_m128i( + (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]); + _mm256_storeu2_m128i( + 
(__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype), + (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]); + } + } +} + +/* Shuffle a block. This can never fail. */ +void +shuffle_avx2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) { + const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i); + + /* If the block size is too small to be vectorized, + use the generic implementation. */ + if (blocksize < vectorized_chunk_size) { + shuffle_generic(bytesoftype, blocksize, _src, _dest); + return; + } + + /* If the blocksize is not a multiple of both the typesize and + the vector size, round the blocksize down to the next value + which is a multiple of both. The vectorized shuffle can be + used for that portion of the data, and the naive implementation + can be used for the remaining portion. */ + const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); + + const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; + const size_t total_elements = blocksize / bytesoftype; + + /* Optimized shuffle implementations */ + switch (bytesoftype) + { + case 2: + shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + case 4: + shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + case 8: + shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + case 16: + shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + default: + /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */ + if (bytesoftype > sizeof(__m128i)) { + shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); + } + else { + /* Non-optimized shuffle */ + shuffle_generic(bytesoftype, blocksize, _src, _dest); + /* The non-optimized function covers the whole buffer, + so we're done processing here. 
*/ + return; + } + } + + /* If the buffer had any bytes at the end which couldn't be handled + by the vectorized implementations, use the non-optimized version + to finish them up. */ + if (vectorizable_bytes < blocksize) { + shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); + } +} + +/* Unshuffle a block. This can never fail. */ +void +unshuffle_avx2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) { + const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i); + + /* If the block size is too small to be vectorized, + use the generic implementation. */ + if (blocksize < vectorized_chunk_size) { + unshuffle_generic(bytesoftype, blocksize, _src, _dest); + return; + } + + /* If the blocksize is not a multiple of both the typesize and + the vector size, round the blocksize down to the next value + which is a multiple of both. The vectorized unshuffle can be + used for that portion of the data, and the naive implementation + can be used for the remaining portion. */ + const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); + + const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; + const size_t total_elements = blocksize / bytesoftype; + + /* Optimized unshuffle implementations */ + switch (bytesoftype) + { + case 2: + unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + case 4: + unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + case 8: + unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + case 16: + unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); + break; + default: + /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. 
*/ + if (bytesoftype > sizeof(__m128i)) { + unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); + } + else { + /* Non-optimized unshuffle */ + unshuffle_generic(bytesoftype, blocksize, _src, _dest); + /* The non-optimized function covers the whole buffer, + so we're done processing here. */ + return; + } + } + + /* If the buffer had any bytes at the end which couldn't be handled + by the vectorized implementations, use the non-optimized version + to finish them up. */ + if (vectorizable_bytes < blocksize) { + unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); + } +} diff --git a/c-blosc/blosc/shuffle-avx2.h b/c-blosc/blosc/shuffle-avx2.h new file mode 100644 index 000000000..b90d752fb --- /dev/null +++ b/c-blosc/blosc/shuffle-avx2.h @@ -0,0 +1,36 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +/* AVX2-accelerated shuffle/unshuffle routines. */ + +#ifndef SHUFFLE_AVX2_H +#define SHUFFLE_AVX2_H + +#include "shuffle-common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + AVX2-accelerated shuffle routine. +*/ +BLOSC_NO_EXPORT void shuffle_avx2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest); + +/** + AVX2-accelerated unshuffle routine. 
+*/ +BLOSC_NO_EXPORT void unshuffle_avx2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest); + +#ifdef __cplusplus +} +#endif + +#endif /* SHUFFLE_AVX2_H */ diff --git a/c-blosc/blosc/shuffle-common.h b/c-blosc/blosc/shuffle-common.h new file mode 100644 index 000000000..3dacd6ed9 --- /dev/null +++ b/c-blosc/blosc/shuffle-common.h @@ -0,0 +1,34 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#ifndef SHUFFLE_COMMON_H +#define SHUFFLE_COMMON_H + +#include "blosc-export.h" + +/* Define the __SSE2__ symbol if compiling with Visual C++ and + targeting the minimum architecture level supporting SSE2. + Other compilers define this as expected and emit warnings + when it is re-defined. */ +#if !defined(__SSE2__) && defined(_MSC_VER) && \ + (defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)) + #define __SSE2__ +#endif + +/* Import standard integer type definitions */ +#if defined(_WIN32) && !defined(__MINGW32__) + #include + #include "win32/stdint-windows.h" +#else + #include + #include + #include + #include +#endif /* _WIN32 */ + +#endif /* SHUFFLE_COMMON_H */ diff --git a/c-blosc/blosc/shuffle-generic.c b/c-blosc/blosc/shuffle-generic.c new file mode 100644 index 000000000..46c6e8311 --- /dev/null +++ b/c-blosc/blosc/shuffle-generic.c @@ -0,0 +1,25 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "shuffle-generic.h" + +/* Shuffle a block. This can never fail. 
*/ +void shuffle_generic(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) +{ + /* Non-optimized shuffle */ + shuffle_generic_inline(bytesoftype, 0, blocksize, _src, _dest); +} + +/* Unshuffle a block. This can never fail. */ +void unshuffle_generic(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) +{ + /* Non-optimized unshuffle */ + unshuffle_generic_inline(bytesoftype, 0, blocksize, _src, _dest); +} diff --git a/c-blosc/blosc/shuffle-generic.h b/c-blosc/blosc/shuffle-generic.h new file mode 100644 index 000000000..c07a24920 --- /dev/null +++ b/c-blosc/blosc/shuffle-generic.h @@ -0,0 +1,99 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +/* Generic (non-hardware-accelerated) shuffle/unshuffle routines. + These are used when hardware-accelerated functions aren't available + for a particular platform; they are also used by the hardware- + accelerated functions to handle any remaining elements in a block + which isn't a multiple of the hardware's vector size. */ + +#ifndef SHUFFLE_GENERIC_H +#define SHUFFLE_GENERIC_H + +#include "shuffle-common.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + Generic (non-hardware-accelerated) shuffle routine. + This is the pure element-copying nested loop. It is used by the + generic shuffle implementation and also by the vectorized shuffle + implementations to process any remaining elements in a block which + is not a multiple of (type_size * vector_size). 
+*/ +static void shuffle_generic_inline(const size_t type_size, + const size_t vectorizable_blocksize, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) +{ + size_t i, j; + /* Calculate the number of elements in the block. */ + const size_t neblock_quot = blocksize / type_size; + const size_t neblock_rem = blocksize % type_size; + const size_t vectorizable_elements = vectorizable_blocksize / type_size; + + + /* Non-optimized shuffle */ + for (j = 0; j < type_size; j++) { + for (i = vectorizable_elements; i < (size_t)neblock_quot; i++) { + _dest[j*neblock_quot+i] = _src[i*type_size+j]; + } + } + + /* Copy any leftover bytes in the block without shuffling them. */ + memcpy(_dest + (blocksize - neblock_rem), _src + (blocksize - neblock_rem), neblock_rem); +} + +/** + Generic (non-hardware-accelerated) unshuffle routine. + This is the pure element-copying nested loop. It is used by the + generic unshuffle implementation and also by the vectorized unshuffle + implementations to process any remaining elements in a block which + is not a multiple of (type_size * vector_size). +*/ +static void unshuffle_generic_inline(const size_t type_size, + const size_t vectorizable_blocksize, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) +{ + size_t i, j; + + /* Calculate the number of elements in the block. */ + const size_t neblock_quot = blocksize / type_size; + const size_t neblock_rem = blocksize % type_size; + const size_t vectorizable_elements = vectorizable_blocksize / type_size; + + /* Non-optimized unshuffle */ + for (i = vectorizable_elements; i < (size_t)neblock_quot; i++) { + for (j = 0; j < type_size; j++) { + _dest[i*type_size+j] = _src[j*neblock_quot+i]; + } + } + + /* Copy any leftover bytes in the block without unshuffling them. */ + memcpy(_dest + (blocksize - neblock_rem), _src + (blocksize - neblock_rem), neblock_rem); +} + +/** + Generic (non-hardware-accelerated) shuffle routine. 
+*/ +BLOSC_NO_EXPORT void shuffle_generic(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest); + +/** + Generic (non-hardware-accelerated) unshuffle routine. +*/ +BLOSC_NO_EXPORT void unshuffle_generic(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest); + +#ifdef __cplusplus +} +#endif + +#endif /* SHUFFLE_GENERIC_H */ diff --git a/c-blosc/blosc/shuffle-sse2.c b/c-blosc/blosc/shuffle-sse2.c new file mode 100644 index 000000000..c829ffbbe --- /dev/null +++ b/c-blosc/blosc/shuffle-sse2.c @@ -0,0 +1,626 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "shuffle-generic.h" +#include "shuffle-sse2.h" + +/* Make sure SSE2 is available for the compilation target and compiler. */ +#if !defined(__SSE2__) + #error SSE2 is not supported by the target architecture/platform and/or this compiler. +#endif + +#include + + +/* The next is useful for debugging purposes */ +#if 0 +#include +#include + +static void printxmm(__m128i xmm0) +{ + uint8_t buf[16]; + + ((__m128i *)buf)[0] = xmm0; + printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", + buf[0], buf[1], buf[2], buf[3], + buf[4], buf[5], buf[6], buf[7], + buf[8], buf[9], buf[10], buf[11], + buf[12], buf[13], buf[14], buf[15]); +} +#endif + + +/* Routine optimized for shuffling a buffer for a type size of 2 bytes. 
*/ +static void +shuffle2_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 2; + size_t j; + int k; + uint8_t* dest_for_jth_element; + __m128i xmm0[2], xmm1[2]; + + for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { + /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */ + for (k = 0; k < 2; k++) { + xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); + xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8); + xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8); + xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); + xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); + xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); + xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); + xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); + xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); + xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); + } + /* Transpose quad words */ + for (k = 0; k < 1; k++) { + xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]); + xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]); + } + /* Store the result vectors */ + dest_for_jth_element = dest + j; + for (k = 0; k < 2; k++) { + _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ +static void +shuffle4_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 4; + size_t i; + int j; + uint8_t* dest_for_ith_element; + __m128i xmm0[4], xmm1[4]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { + /* Fetch 16 elements (64 bytes) then transpose bytes and words. 
*/ + for (j = 0; j < 4; j++) { + xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i)))); + xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8); + xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d); + xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]); + xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e); + xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]); + } + /* Transpose double words */ + for (j = 0; j < 2; j++) { + xmm1[j*2] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]); + xmm1[j*2+1] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]); + } + /* Transpose quad words */ + for (j = 0; j < 2; j++) { + xmm0[j*2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j+2]); + xmm0[j*2+1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j+2]); + } + /* Store the result vectors */ + dest_for_ith_element = dest + i; + for (j = 0; j < 4; j++) { + _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ +static void +shuffle8_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 8; + size_t j; + int k, l; + uint8_t* dest_for_jth_element; + __m128i xmm0[8], xmm1[8]; + + for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { + /* Fetch 16 elements (128 bytes) then transpose bytes. 
*/ + for (k = 0; k < 8; k++) { + xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); + xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); + xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); + } + /* Transpose words */ + for (k = 0, l = 0; k < 4; k++, l +=2) { + xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]); + xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]); + } + /* Transpose double words */ + for (k = 0, l = 0; k < 4; k++, l++) { + if (k == 2) l += 2; + xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]); + xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]); + } + /* Transpose quad words */ + for (k = 0; k < 4; k++) { + xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]); + xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]); + } + /* Store the result vectors */ + dest_for_jth_element = dest + j; + for (k = 0; k < 8; k++) { + _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ +static void +shuffle16_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 16; + size_t j; + int k, l; + uint8_t* dest_for_jth_element; + __m128i xmm0[16], xmm1[16]; + + for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { + /* Fetch 16 elements (256 bytes). 
*/ + for (k = 0; k < 16; k++) { + xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); + } + /* Transpose bytes */ + for (k = 0, l = 0; k < 8; k++, l +=2) { + xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); + xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); + } + /* Transpose words */ + for (k = 0, l = -2; k < 8; k++, l++) { + if ((k%2) == 0) l += 2; + xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); + xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); + } + /* Transpose double words */ + for (k = 0, l = -4; k < 8; k++, l++) { + if ((k%4) == 0) l += 4; + xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); + xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); + } + /* Transpose quad words */ + for (k = 0; k < 8; k++) { + xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); + xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); + } + /* Store the result vectors */ + dest_for_jth_element = dest + j; + for (k = 0; k < 16; k++) { + _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); + } + } +} + +/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ +static void +shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) +{ + size_t j; + const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i); + int k, l; + uint8_t* dest_for_jth_element; + __m128i xmm0[16], xmm1[16]; + + for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { + /* Advance the offset into the type by the vector size (in bytes), unless this is + the initial iteration and the type size is not a multiple of the vector size. + In that case, only advance by the number of bytes necessary so that the number + of remaining bytes in the type will be a multiple of the vector size. 
*/ + size_t offset_into_type; + for (offset_into_type = 0; offset_into_type < bytesoftype; + offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) { + + /* Fetch elements in groups of 256 bytes */ + const uint8_t* const src_with_offset = src + offset_into_type; + for (k = 0; k < 16; k++) { + xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype)); + } + /* Transpose bytes */ + for (k = 0, l = 0; k < 8; k++, l +=2) { + xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); + xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); + } + /* Transpose words */ + for (k = 0, l = -2; k < 8; k++, l++) { + if ((k%2) == 0) l += 2; + xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); + xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); + } + /* Transpose double words */ + for (k = 0, l = -4; k < 8; k++, l++) { + if ((k%4) == 0) l += 4; + xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); + xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); + } + /* Transpose quad words */ + for (k = 0; k < 8; k++) { + xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); + xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); + } + /* Store the result vectors */ + dest_for_jth_element = dest + j; + for (k = 0; k < 16; k++) { + _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]); + } + } + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ +static void +unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 2; + size_t i; + int j; + __m128i xmm0[2], xmm1[2]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { + /* Load 16 elements (32 bytes) into 2 XMM registers. 
*/ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 2; j++) { + xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + /* Compute the low 32 bytes */ + xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]); + /* Compute the hi 32 bytes */ + xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]); + /* Store the result vectors in proper order */ + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ +static void +unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 4; + size_t i; + int j; + __m128i xmm0[4], xmm1[4]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { + /* Load 16 elements (64 bytes) into 4 XMM registers. 
*/ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 4; j++) { + xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + for (j = 0; j < 2; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 2; j++) { + /* Compute the low 32 bytes */ + xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); + /* Compute the hi 32 bytes */ + xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); + } + /* Store the result vectors in proper order */ + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ +static void +unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 8; + size_t i; + int j; + __m128i xmm0[8], xmm1[8]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { + /* Load 16 elements (128 bytes) into 8 XMM registers. 
*/ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 8; j++) { + xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + for (j = 0; j < 4; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 4; j++) { + /* Compute the low 32 bytes */ + xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); + /* Compute the hi 32 bytes */ + xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); + } + /* Shuffle 4-byte dwords */ + for (j = 0; j < 4; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]); + } + /* Store the result vectors in proper order */ + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. 
*/ +static void +unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src, + const size_t vectorizable_elements, const size_t total_elements) +{ + static const size_t bytesoftype = 16; + size_t i; + int j; + __m128i xmm1[16], xmm2[16]; + + for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { + /* Load 16 elements (256 bytes) into 16 XMM registers. */ + const uint8_t* const src_for_ith_element = src + i; + for (j = 0; j < 16; j++) { + xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); + } + /* Shuffle bytes */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); + /* Compute the hi 32 bytes */ + xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); + } + /* Shuffle 4-byte dwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); + /* Compute the hi 32 bytes */ + xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); + } + /* Shuffle 8-byte qwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); + } + + /* Store the result vectors in proper order */ + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), 
xmm1[2]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]); + _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]); + } +} + +/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ +static void +unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig, + const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) +{ + size_t i; + const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i); + + int j; + uint8_t* dest_with_offset; + __m128i xmm1[16], xmm2[16]; + + /* The unshuffle loops are inverted (compared to shuffle_tiled16_sse2) + to optimize cache utilization. */ + size_t offset_into_type; + for (offset_into_type = 0; offset_into_type < bytesoftype; + offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? 
vecs_per_el_rem : sizeof(__m128i))) { + for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { + /* Load the first 128 bytes in 16 XMM registers */ + const uint8_t* const src_for_ith_element = orig + i; + for (j = 0; j < 16; j++) { + xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); + } + /* Shuffle bytes */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); + /* Compute the hi 32 bytes */ + xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); + } + /* Shuffle 2-byte words */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); + } + /* Shuffle 4-byte dwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); + /* Compute the hi 32 bytes */ + xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); + } + /* Shuffle 8-byte qwords */ + for (j = 0; j < 8; j++) { + /* Compute the low 32 bytes */ + xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); + /* Compute the hi 32 bytes */ + xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); + } + + /* Store the result vectors in proper order */ + dest_with_offset = dest + offset_into_type; + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]); + 
_mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]); + _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]); + } + } +} + +/* Shuffle a block. This can never fail. */ +void +shuffle_sse2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) { + const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i); + /* If the blocksize is not a multiple of both the typesize and + the vector size, round the blocksize down to the next value + which is a multiple of both. The vectorized shuffle can be + used for that portion of the data, and the naive implementation + can be used for the remaining portion. */ + const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); + const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; + const size_t total_elements = blocksize / bytesoftype; + + /* If the block size is too small to be vectorized, + use the generic implementation. 
*/ + if (blocksize < vectorized_chunk_size) { + shuffle_generic(bytesoftype, blocksize, _src, _dest); + return; + } + + /* Optimized shuffle implementations */ + switch (bytesoftype) + { + case 2: + shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + case 4: + shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + case 8: + shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + case 16: + shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + default: + if (bytesoftype > sizeof(__m128i)) { + shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); + } + else { + /* Non-optimized shuffle */ + shuffle_generic(bytesoftype, blocksize, _src, _dest); + /* The non-optimized function covers the whole buffer, + so we're done processing here. */ + return; + } + } + + /* If the buffer had any bytes at the end which couldn't be handled + by the vectorized implementations, use the non-optimized version + to finish them up. */ + if (vectorizable_bytes < blocksize) { + shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); + } +} + +/* Unshuffle a block. This can never fail. */ +void +unshuffle_sse2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest) { + const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i); + /* If the blocksize is not a multiple of both the typesize and + the vector size, round the blocksize down to the next value + which is a multiple of both. The vectorized unshuffle can be + used for that portion of the data, and the naive implementation + can be used for the remaining portion. 
*/ + const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); + const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; + const size_t total_elements = blocksize / bytesoftype; + + + /* If the block size is too small to be vectorized, + use the generic implementation. */ + if (blocksize < vectorized_chunk_size) { + unshuffle_generic(bytesoftype, blocksize, _src, _dest); + return; + } + + /* Optimized unshuffle implementations */ + switch (bytesoftype) + { + case 2: + unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + case 4: + unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + case 8: + unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + case 16: + unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); + break; + default: + if (bytesoftype > sizeof(__m128i)) { + unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); + } + else { + /* Non-optimized unshuffle */ + unshuffle_generic(bytesoftype, blocksize, _src, _dest); + /* The non-optimized function covers the whole buffer, + so we're done processing here. */ + return; + } + } + + /* If the buffer had any bytes at the end which couldn't be handled + by the vectorized implementations, use the non-optimized version + to finish them up. */ + if (vectorizable_bytes < blocksize) { + unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); + } +} diff --git a/c-blosc/blosc/shuffle-sse2.h b/c-blosc/blosc/shuffle-sse2.h new file mode 100644 index 000000000..6e9d53a74 --- /dev/null +++ b/c-blosc/blosc/shuffle-sse2.h @@ -0,0 +1,36 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. 
+**********************************************************************/ + +/* SSE2-accelerated shuffle/unshuffle routines. */ + +#ifndef SHUFFLE_SSE2_H +#define SHUFFLE_SSE2_H + +#include "shuffle-common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + SSE2-accelerated shuffle routine. +*/ +BLOSC_NO_EXPORT void shuffle_sse2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest); + +/** + SSE2-accelerated unshuffle routine. +*/ +BLOSC_NO_EXPORT void unshuffle_sse2(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, uint8_t* const _dest); + +#ifdef __cplusplus +} +#endif + +#endif /* SHUFFLE_SSE2_H */ diff --git a/c-blosc/blosc/shuffle.c b/c-blosc/blosc/shuffle.c index a19bc803a..e8983bd6e 100644 --- a/c-blosc/blosc/shuffle.c +++ b/c-blosc/blosc/shuffle.c @@ -1,502 +1,445 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library - Author: Francesc Alted + Author: Francesc Alted Creation date: 2009-05-20 See LICENSES/BLOSC.txt for details about copyright and rights to use. 
**********************************************************************/ +#include "shuffle.h" +#include "shuffle-common.h" +#include "shuffle-generic.h" +#include "bitshuffle-generic.h" #include #include -#include "shuffle.h" -#if defined(_WIN32) && !defined(__MINGW32__) - #include - #include "win32/stdint-windows.h" - #define __SSE2__ /* Windows does not define this by default */ +/* Visual Studio < 2013 does not have stdbool.h so here it is a replacement: */ +#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L +/* have a C99 compiler */ +typedef _Bool bool; #else - #include - #include -#endif /* _WIN32 */ - +/* do not have a C99 compiler */ +typedef unsigned char bool; +#endif +static const bool false = 0; +static const bool true = 1; -/* The non-SSE2 versions of shuffle and unshuffle */ -/* Shuffle a block. This can never fail. */ -static void _shuffle(size_t bytesoftype, size_t blocksize, - uint8_t* _src, uint8_t* _dest) -{ - size_t i, j, neblock, leftover; +#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +#define HAVE_CPU_FEAT_INTRIN +#endif - /* Non-optimized shuffle */ - neblock = blocksize / bytesoftype; /* Number of elements in a block */ - for (j = 0; j < bytesoftype; j++) { - for (i = 0; i < neblock; i++) { - _dest[j*neblock+i] = _src[i*bytesoftype+j]; - } +/* Include hardware-accelerated shuffle/unshuffle routines based on + the target architecture. Note that a target architecture may support + more than one type of acceleration!*/ +#if defined(SHUFFLE_AVX2_ENABLED) + #include "shuffle-avx2.h" + #include "bitshuffle-avx2.h" +#endif /* defined(SHUFFLE_AVX2_ENABLED) */ + +#if defined(SHUFFLE_SSE2_ENABLED) + #include "shuffle-sse2.h" + #include "bitshuffle-sse2.h" +#endif /* defined(SHUFFLE_SSE2_ENABLED) */ + + +/* Define function pointer types for shuffle/unshuffle routines. 
*/ +typedef void(*shuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*); +typedef void(*unshuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*); +typedef int64_t(*bitshuffle_func)(void*, void*, const size_t, const size_t, void*); +typedef int64_t(*bitunshuffle_func)(void*, void*, const size_t, const size_t, void*); + +/* An implementation of shuffle/unshuffle routines. */ +typedef struct shuffle_implementation { + /* Name of this implementation. */ + const char* name; + /* Function pointer to the shuffle routine for this implementation. */ + shuffle_func shuffle; + /* Function pointer to the unshuffle routine for this implementation. */ + unshuffle_func unshuffle; + /* Function pointer to the bitshuffle routine for this implementation. */ + bitshuffle_func bitshuffle; + /* Function pointer to the bitunshuffle routine for this implementation. */ + bitunshuffle_func bitunshuffle; +} shuffle_implementation_t; + +typedef enum { + BLOSC_HAVE_NOTHING = 0, + BLOSC_HAVE_SSE2 = 1, + BLOSC_HAVE_AVX2 = 2 +} blosc_cpu_features; + +/* Detect hardware and set function pointers to the best shuffle/unshuffle + implementations supported by the host processor. */ +#if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) /* Intel/i686 */ + +/* Disabled the __builtin_cpu_supports() call, as it has issues with + new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial: + "undefined symbol: __cpu_model" + For a similar report, see: + https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/ +*/ +#if defined(HAVE_CPU_FEAT_INTRIN) && 0 +static blosc_cpu_features blosc_get_cpu_features(void) { + blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING; + if (__builtin_cpu_supports("sse2")) { + cpu_features |= BLOSC_HAVE_SSE2; } - leftover = blocksize % bytesoftype; - memcpy(_dest + neblock*bytesoftype, _src + neblock*bytesoftype, leftover); -} - -/* Unshuffle a block. 
This can never fail. */ -static void _unshuffle(size_t bytesoftype, size_t blocksize, - uint8_t* _src, uint8_t* _dest) -{ - size_t i, j, neblock, leftover; - - /* Non-optimized unshuffle */ - neblock = blocksize / bytesoftype; /* Number of elements in a block */ - for (i = 0; i < neblock; i++) { - for (j = 0; j < bytesoftype; j++) { - _dest[i*bytesoftype+j] = _src[j*neblock+i]; - } + if (__builtin_cpu_supports("avx2")) { + cpu_features |= BLOSC_HAVE_AVX2; } - leftover = blocksize % bytesoftype; - memcpy(_dest+neblock*bytesoftype, _src+neblock*bytesoftype, leftover); + return cpu_features; } +#else +#if defined(_MSC_VER) && !defined(__clang__) + #include /* Needed for __cpuid */ -#ifdef __SSE2__ - -/* The SSE2 versions of shuffle and unshuffle */ - -#include - -/* The next is useful for debugging purposes */ -#if 0 -static void printxmm(__m128i xmm0) -{ - uint8_t buf[16]; - - ((__m128i *)buf)[0] = xmm0; - printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", - buf[0], buf[1], buf[2], buf[3], - buf[4], buf[5], buf[6], buf[7], - buf[8], buf[9], buf[10], buf[11], - buf[12], buf[13], buf[14], buf[15]); -} -#endif +/* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */ +#if _MSC_FULL_VER >= 160040219 + #include /* Needed for _xgetbv */ +#elif defined(_M_IX86) +/* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */ -/* Routine optimized for shuffling a buffer for a type size of 2 bytes. 
*/ -static void -shuffle2(uint8_t* dest, uint8_t* src, size_t size) -{ - size_t i, j, k; - size_t numof16belem; - __m128i xmm0[2], xmm1[2]; - - numof16belem = size / (16*2); - for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) { - /* Fetch and transpose bytes, words and double words in groups of - 32 bytes */ - for (k = 0; k < 2; k++) { - xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); - xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8); - xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8); - xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); - xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); - xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); - xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); - xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); - xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); - xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); - } - /* Transpose quad words */ - for (k = 0; k < 1; k++) { - xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]); - xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]); - } - /* Store the result vectors */ - for (k = 0; k < 2; k++) { - ((__m128i *)dest)[k*numof16belem+i] = xmm1[k]; +static uint64_t _xgetbv(uint32_t xcr) { + uint32_t xcr0, xcr1; + __asm { + mov ecx, xcr + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 + mov xcr0, eax + mov xcr1, edx } - } + return ((uint64_t)xcr1 << 32) | xcr0; } +#elif defined(_M_X64) -/* Routine optimized for shuffling a buffer for a type size of 4 bytes. 
*/ -static void -shuffle4(uint8_t* dest, uint8_t* src, size_t size) -{ - size_t i, j, k; - size_t numof16belem; - __m128i xmm0[4], xmm1[4]; - - numof16belem = size / (16*4); - for (i = 0, j = 0; i < numof16belem; i++, j += 16*4) { - /* Fetch and transpose bytes and words in groups of 64 bytes */ - for (k = 0; k < 4; k++) { - xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); - xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); - xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0x8d); - xmm0[k] = _mm_unpacklo_epi8(xmm1[k], xmm0[k]); - xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x04e); - xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); - } - /* Transpose double words */ - for (k = 0; k < 2; k++) { - xmm1[k*2] = _mm_unpacklo_epi32(xmm0[k*2], xmm0[k*2+1]); - xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[k*2], xmm0[k*2+1]); - } - /* Transpose quad words */ - for (k = 0; k < 2; k++) { - xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+2]); - xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+2]); - } - /* Store the result vectors */ - for (k = 0; k < 4; k++) { - ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; - } - } +/* Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets. + These compilers don't support any of the newer acceleration ISAs + (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2 + which means we can get away with returning a hard-coded value from + this implementation of _xgetbv. */ + +static inline uint64_t +_xgetbv(uint32_t xcr) { + /* A 64-bit OS must have XMM save support. */ + return xcr == 0 ? (1UL << 1) : 0UL; } +#else -/* Routine optimized for shuffling a buffer for a type size of 8 bytes. 
*/ -static void -shuffle8(uint8_t* dest, uint8_t* src, size_t size) -{ - size_t i, j, k, l; - size_t numof16belem; - __m128i xmm0[8], xmm1[8]; - - numof16belem = size / (16*8); - for (i = 0, j = 0; i < numof16belem; i++, j += 16*8) { - /* Fetch and transpose bytes in groups of 128 bytes */ - for (k = 0; k < 8; k++) { - xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); - xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); - xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); - } - /* Transpose words */ - for (k = 0, l = 0; k < 4; k++, l +=2) { - xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]); - xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]); - } - /* Transpose double words */ - for (k = 0, l = 0; k < 4; k++, l++) { - if (k == 2) l += 2; - xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]); - xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]); - } - /* Transpose quad words */ - for (k = 0; k < 4; k++) { - xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]); - xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]); - } - /* Store the result vectors */ - for (k = 0; k < 8; k++) { - ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; - } - } -} +/* Hardware detection for any other MSVC targets (e.g., ARM) + isn't implemented at this time. */ +#error This version of c-blosc only supports x86 and x64 targets with MSVC. +#endif /* _MSC_FULL_VER >= 160040219 */ + +#else -/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ -static void -shuffle16(uint8_t* dest, uint8_t* src, size_t size) -{ - size_t i, j, k, l; - size_t numof16belem; - __m128i xmm0[16], xmm1[16]; +/* Implement the __cpuid and __cpuidex intrinsics for GCC, Clang, + and others using inline assembly. */ +__attribute__((always_inline)) +static inline void +__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) { + __asm__ __volatile__ ( +# if defined(__i386__) && defined (__PIC__) + /* Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored. 
+ https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ + "movl %%ebx, %%edi\n\t" + "cpuid\n\t" + "xchgl %%ebx, %%edi": + "=D" (cpuInfo[1]), +#else + "cpuid": + "=b" (cpuInfo[1]), +#endif /* defined(__i386) && defined(__PIC__) */ + "=a" (cpuInfo[0]), + "=c" (cpuInfo[2]), + "=d" (cpuInfo[3]) : + "a" (function_id), "c" (subfunction_id) + ); +} - numof16belem = size / (16*16); - for (i = 0, j = 0; i < numof16belem; i++, j += 16*16) { - /* Fetch elements in groups of 256 bytes */ - for (k = 0; k < 16; k++) { - xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); - } - /* Transpose bytes */ - for (k = 0, l = 0; k < 8; k++, l +=2) { - xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); - xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); - } - /* Transpose words */ - for (k = 0, l = -2; k < 8; k++, l++) { - if ((k%2) == 0) l += 2; - xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); - xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); - } - /* Transpose double words */ - for (k = 0, l = -4; k < 8; k++, l++) { - if ((k%4) == 0) l += 4; - xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); - xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); - } - /* Transpose quad words */ - for (k = 0; k < 8; k++) { - xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); - xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); - } - /* Store the result vectors */ - for (k = 0; k < 16; k++) { - ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; - } - } +#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0) + +#define _XCR_XFEATURE_ENABLED_MASK 0 + +/* Reads the content of an extended control register. 
+ https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family +*/ +static inline uint64_t +_xgetbv(uint32_t xcr) { + uint32_t eax, edx; + __asm__ __volatile__ ( + /* "xgetbv" + This is specified as raw instruction bytes due to some older compilers + having issues with the mnemonic form. + */ + ".byte 0x0f, 0x01, 0xd0": + "=a" (eax), + "=d" (edx) : + "c" (xcr) + ); + return ((uint64_t)edx << 32) | eax; } +#endif /* defined(_MSC_FULL_VER) */ -/* Shuffle a block. This can never fail. */ -void shuffle(size_t bytesoftype, size_t blocksize, - uint8_t* _src, uint8_t* _dest) { - int unaligned_dest = (int)((uintptr_t)_dest % 16); - int multiple_of_block = (blocksize % (16 * bytesoftype)) == 0; - int too_small = (blocksize < 256); +#ifndef _XCR_XFEATURE_ENABLED_MASK +#define _XCR_XFEATURE_ENABLED_MASK 0x0 +#endif - if (unaligned_dest || !multiple_of_block || too_small) { - /* _dest buffer is not aligned, not multiple of the vectorization size - * or is too small. Call the non-sse2 version. */ - _shuffle(bytesoftype, blocksize, _src, _dest); - return; +static blosc_cpu_features blosc_get_cpu_features(void) { + blosc_cpu_features result = BLOSC_HAVE_NOTHING; + int32_t max_basic_function_id; + /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */ + int32_t cpu_info[4]; + int sse2_available; + int sse3_available; + int ssse3_available; + int sse41_available; + int sse42_available; + int xsave_available; + int xsave_enabled_by_os; + int avx2_available = 0; + int avx512bw_available = 0; + int xmm_state_enabled = 0; + int ymm_state_enabled = 0; + int zmm_state_enabled = 0; + uint64_t xcr0_contents; + + /* Get the number of basic functions available. 
*/ + __cpuid(cpu_info, 0); + max_basic_function_id = cpu_info[0]; + + /* Check for SSE-based features and required OS support */ + __cpuid(cpu_info, 1); + sse2_available = (cpu_info[3] & (1 << 26)) != 0; + sse3_available = (cpu_info[2] & (1 << 0)) != 0; + ssse3_available = (cpu_info[2] & (1 << 9)) != 0; + sse41_available = (cpu_info[2] & (1 << 19)) != 0; + sse42_available = (cpu_info[2] & (1 << 20)) != 0; + + xsave_available = (cpu_info[2] & (1 << 26)) != 0; + xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0; + + /* Check for AVX-based features, if the processor supports extended features. */ + if (max_basic_function_id >= 7) { + __cpuid(cpu_info, 7); + avx2_available = (cpu_info[1] & (1 << 5)) != 0; + avx512bw_available = (cpu_info[1] & (1 << 30)) != 0; } - /* Optimized shuffle */ - /* The buffer must be aligned on a 16 bytes boundary, have a power */ - /* of 2 size and be larger or equal than 256 bytes. */ - if (bytesoftype == 4) { - shuffle4(_dest, _src, blocksize); - } - else if (bytesoftype == 8) { - shuffle8(_dest, _src, blocksize); + /* Even if certain features are supported by the CPU, they may not be supported + by the OS (in which case using them would crash the process or system). + If xsave is available and enabled by the OS, check the contents of the + extended control register XCR0 to see if the CPU features are enabled. */ +#if defined(_XCR_XFEATURE_ENABLED_MASK) + if (xsave_available && xsave_enabled_by_os && ( + sse2_available || sse3_available || ssse3_available + || sse41_available || sse42_available + || avx2_available || avx512bw_available)) { + /* Determine which register states can be restored by the OS. */ + xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + + xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0; + ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0; + + /* Require support for both the upper 256-bits of zmm0-zmm15 to be + restored as well as all of zmm16-zmm31 and the opmask registers. 
*/ + zmm_state_enabled = (xcr0_contents & 0x70) == 0x70; } - else if (bytesoftype == 16) { - shuffle16(_dest, _src, blocksize); +#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */ + +#if defined(BLOSC_DUMP_CPU_INFO) + printf("Shuffle CPU Information:\n"); + printf("SSE2 available: %s\n", sse2_available ? "True" : "False"); + printf("SSE3 available: %s\n", sse3_available ? "True" : "False"); + printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False"); + printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False"); + printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False"); + printf("AVX2 available: %s\n", avx2_available ? "True" : "False"); + printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False"); + printf("XSAVE available: %s\n", xsave_available ? "True" : "False"); + printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False"); + printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False"); + printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False"); + printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False"); +#endif /* defined(BLOSC_DUMP_CPU_INFO) */ + + /* Using the gathered CPU information, determine which implementation to use. */ + /* technically could fail on sse2 cpu on os without xmm support, but that + * shouldn't exist anymore */ + if (sse2_available) { + result |= BLOSC_HAVE_SSE2; } - else if (bytesoftype == 2) { - shuffle2(_dest, _src, blocksize); - } - else { - /* Non-optimized shuffle */ - _shuffle(bytesoftype, blocksize, _src, _dest); + if (xmm_state_enabled && ymm_state_enabled && avx2_available) { + result |= BLOSC_HAVE_AVX2; } + return result; } +#endif +#else /* No hardware acceleration supported for the target architecture. */ + #if defined(_MSC_VER) + #pragma message("Hardware-acceleration detection not implemented for the target architecture. 
Only the generic shuffle/unshuffle routines will be available.") + #else + #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available. + #endif -/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ -static void -unshuffle2(uint8_t* dest, uint8_t* orig, size_t size) -{ - size_t i, k; - size_t neblock, numof16belem; - __m128i xmm1[2], xmm2[2]; - - neblock = size / 2; - numof16belem = neblock / 16; - for (i = 0, k = 0; i < numof16belem; i++, k += 2) { - /* Load the first 32 bytes in 2 XMM registrers */ - xmm1[0] = ((__m128i *)orig)[0*numof16belem+i]; - xmm1[1] = ((__m128i *)orig)[1*numof16belem+i]; - /* Shuffle bytes */ - /* Compute the low 32 bytes */ - xmm2[0] = _mm_unpacklo_epi8(xmm1[0], xmm1[1]); - /* Compute the hi 32 bytes */ - xmm2[1] = _mm_unpackhi_epi8(xmm1[0], xmm1[1]); - /* Store the result vectors in proper order */ - ((__m128i *)dest)[k+0] = xmm2[0]; - ((__m128i *)dest)[k+1] = xmm2[1]; - } +static blosc_cpu_features blosc_get_cpu_features(void) { + return BLOSC_HAVE_NOTHING; } +#endif -/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. 
*/ -static void -unshuffle4(uint8_t* dest, uint8_t* orig, size_t size) -{ - size_t i, j, k; - size_t neblock, numof16belem; - __m128i xmm0[4], xmm1[4]; - - neblock = size / 4; - numof16belem = neblock / 16; - for (i = 0, k = 0; i < numof16belem; i++, k += 4) { - /* Load the first 64 bytes in 4 XMM registrers */ - for (j = 0; j < 4; j++) { - xmm0[j] = ((__m128i *)orig)[j*numof16belem+i]; - } - /* Shuffle bytes */ - for (j = 0; j < 2; j++) { - /* Compute the low 32 bytes */ - xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); - /* Compute the hi 32 bytes */ - xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); - } - /* Shuffle 2-byte words */ - for (j = 0; j < 2; j++) { - /* Compute the low 32 bytes */ - xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); - /* Compute the hi 32 bytes */ - xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); - } - /* Store the result vectors in proper order */ - ((__m128i *)dest)[k+0] = xmm0[0]; - ((__m128i *)dest)[k+1] = xmm0[2]; - ((__m128i *)dest)[k+2] = xmm0[1]; - ((__m128i *)dest)[k+3] = xmm0[3]; +static shuffle_implementation_t get_shuffle_implementation() { + blosc_cpu_features cpu_features = blosc_get_cpu_features(); + shuffle_implementation_t impl_generic; + +#if defined(SHUFFLE_AVX2_ENABLED) + if (cpu_features & BLOSC_HAVE_AVX2) { + shuffle_implementation_t impl_avx2; + impl_avx2.name = "avx2"; + impl_avx2.shuffle = (shuffle_func)shuffle_avx2; + impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2; + impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2; + impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2; + return impl_avx2; } -} - - -/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. 
*/ -static void -unshuffle8(uint8_t* dest, uint8_t* orig, size_t size) -{ - size_t i, j, k; - size_t neblock, numof16belem; - __m128i xmm0[8], xmm1[8]; - - neblock = size / 8; - numof16belem = neblock / 16; - for (i = 0, k = 0; i < numof16belem; i++, k += 8) { - /* Load the first 64 bytes in 8 XMM registrers */ - for (j = 0; j < 8; j++) { - xmm0[j] = ((__m128i *)orig)[j*numof16belem+i]; - } - /* Shuffle bytes */ - for (j = 0; j < 4; j++) { - /* Compute the low 32 bytes */ - xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); - /* Compute the hi 32 bytes */ - xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); - } - /* Shuffle 2-byte words */ - for (j = 0; j < 4; j++) { - /* Compute the low 32 bytes */ - xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); - /* Compute the hi 32 bytes */ - xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); - } - /* Shuffle 4-byte dwords */ - for (j = 0; j < 4; j++) { - /* Compute the low 32 bytes */ - xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]); - /* Compute the hi 32 bytes */ - xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]); - } - /* Store the result vectors in proper order */ - ((__m128i *)dest)[k+0] = xmm1[0]; - ((__m128i *)dest)[k+1] = xmm1[4]; - ((__m128i *)dest)[k+2] = xmm1[2]; - ((__m128i *)dest)[k+3] = xmm1[6]; - ((__m128i *)dest)[k+4] = xmm1[1]; - ((__m128i *)dest)[k+5] = xmm1[5]; - ((__m128i *)dest)[k+6] = xmm1[3]; - ((__m128i *)dest)[k+7] = xmm1[7]; +#endif /* defined(SHUFFLE_AVX2_ENABLED) */ + +#if defined(SHUFFLE_SSE2_ENABLED) + if (cpu_features & BLOSC_HAVE_SSE2) { + shuffle_implementation_t impl_sse2; + impl_sse2.name = "sse2"; + impl_sse2.shuffle = (shuffle_func)shuffle_sse2; + impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2; + impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2; + impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2; + return impl_sse2; } +#endif /* defined(SHUFFLE_SSE2_ENABLED) */ + + /* Processor doesn't support any of the 
hardware-accelerated implementations, + so use the generic implementation. */ + impl_generic.name = "generic"; + impl_generic.shuffle = (shuffle_func)shuffle_generic; + impl_generic.unshuffle = (unshuffle_func)unshuffle_generic; + impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal; + impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal; + return impl_generic; } -/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ -static void -unshuffle16(uint8_t* dest, uint8_t* orig, size_t size) -{ - size_t i, j, k; - size_t neblock, numof16belem; - __m128i xmm1[16], xmm2[16]; - - neblock = size / 16; - numof16belem = neblock / 16; - for (i = 0, k = 0; i < numof16belem; i++, k += 16) { - /* Load the first 128 bytes in 16 XMM registrers */ - for (j = 0; j < 16; j++) { - xmm1[j] = ((__m128i *)orig)[j*numof16belem+i]; - } - /* Shuffle bytes */ - for (j = 0; j < 8; j++) { - /* Compute the low 32 bytes */ - xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); - /* Compute the hi 32 bytes */ - xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); - } - /* Shuffle 2-byte words */ - for (j = 0; j < 8; j++) { - /* Compute the low 32 bytes */ - xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); - /* Compute the hi 32 bytes */ - xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); - } - /* Shuffle 4-byte dwords */ - for (j = 0; j < 8; j++) { - /* Compute the low 32 bytes */ - xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); - /* Compute the hi 32 bytes */ - xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); - } - /* Shuffle 8-byte qwords */ - for (j = 0; j < 8; j++) { - /* Compute the low 32 bytes */ - xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); - /* Compute the hi 32 bytes */ - xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); - } - /* Store the result vectors in proper order */ - ((__m128i *)dest)[k+0] = xmm1[0]; - ((__m128i *)dest)[k+1] = xmm1[8]; - ((__m128i *)dest)[k+2] = xmm1[4]; - 
((__m128i *)dest)[k+3] = xmm1[12]; - ((__m128i *)dest)[k+4] = xmm1[2]; - ((__m128i *)dest)[k+5] = xmm1[10]; - ((__m128i *)dest)[k+6] = xmm1[6]; - ((__m128i *)dest)[k+7] = xmm1[14]; - ((__m128i *)dest)[k+8] = xmm1[1]; - ((__m128i *)dest)[k+9] = xmm1[9]; - ((__m128i *)dest)[k+10] = xmm1[5]; - ((__m128i *)dest)[k+11] = xmm1[13]; - ((__m128i *)dest)[k+12] = xmm1[3]; - ((__m128i *)dest)[k+13] = xmm1[11]; - ((__m128i *)dest)[k+14] = xmm1[7]; - ((__m128i *)dest)[k+15] = xmm1[15]; - } -} - +/* Flag indicating whether the implementation has been initialized. + Zero means it hasn't been initialized, non-zero means it has. */ +static int32_t implementation_initialized; -/* Unshuffle a block. This can never fail. */ -void unshuffle(size_t bytesoftype, size_t blocksize, - uint8_t* _src, uint8_t* _dest) { - int unaligned_src = (int)((uintptr_t)_src % 16); - int unaligned_dest = (int)((uintptr_t)_dest % 16); - int multiple_of_block = (blocksize % (16 * bytesoftype)) == 0; - int too_small = (blocksize < 256); +/* The dynamically-chosen shuffle/unshuffle implementation. + This is only safe to use once `implementation_initialized` is set. */ +static shuffle_implementation_t host_implementation; - if (unaligned_src || unaligned_dest || !multiple_of_block || too_small) { - /* _src or _dest buffer is not aligned, not multiple of the vectorization - * size or is not too small. Call the non-sse2 version. */ - _unshuffle(bytesoftype, blocksize, _src, _dest); - return; - } +/* Initialize the shuffle implementation, if necessary. */ +#if defined(__GNUC__) || defined(__clang__) +__attribute__((always_inline)) +#endif +static +#if defined(_MSC_VER) +__forceinline +#else +inline +#endif +void init_shuffle_implementation() { + /* Initialization could (in rare cases) take place concurrently on + multiple threads, but it shouldn't matter because the + initialization should return the same result on each thread (so + the implementation will be the same). 
Since that's the case we + can avoid complicated synchronization here and get a small + performance benefit because we don't need to perform a volatile + load on the initialization variable each time this function is + called. */ +#if defined(__GNUC__) || defined(__clang__) + if (__builtin_expect(!implementation_initialized, 0)) { +#else + if (!implementation_initialized) { +#endif + /* Initialize the implementation. */ + host_implementation = get_shuffle_implementation(); - /* Optimized unshuffle */ - /* The buffers must be aligned on a 16 bytes boundary, have a power */ - /* of 2 size and be larger or equal than 256 bytes. */ - if (bytesoftype == 4) { - unshuffle4(_dest, _src, blocksize); - } - else if (bytesoftype == 8) { - unshuffle8(_dest, _src, blocksize); - } - else if (bytesoftype == 16) { - unshuffle16(_dest, _src, blocksize); - } - else if (bytesoftype == 2) { - unshuffle2(_dest, _src, blocksize); - } - else { - /* Non-optimized unshuffle */ - _unshuffle(bytesoftype, blocksize, _src, _dest); + /* Set the flag indicating the implementation has been initialized. */ + implementation_initialized = 1; } } -#else /* no __SSE2__ available */ +/* Shuffle a block by dynamically dispatching to the appropriate + hardware-accelerated routine at run-time. */ +void +shuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* _src, const uint8_t* _dest) { + /* Initialize the shuffle implementation if necessary. */ + init_shuffle_implementation(); + + /* The implementation is initialized. + Dispatch to its shuffle routine. */ + (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest); +} -void shuffle(size_t bytesoftype, size_t blocksize, - uint8_t* _src, uint8_t* _dest) { - _shuffle(bytesoftype, blocksize, _src, _dest); +/* Unshuffle a block by dynamically dispatching to the appropriate + hardware-accelerated routine at run-time. 
*/ +void +unshuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* _src, const uint8_t* _dest) { + /* Initialize the shuffle implementation if necessary. */ + init_shuffle_implementation(); + + /* The implementation is initialized. + Dispatch to it's unshuffle routine. */ + (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest); } -void unshuffle(size_t bytesoftype, size_t blocksize, - uint8_t* _src, uint8_t* _dest) { - _unshuffle(bytesoftype, blocksize, _src, _dest); +/* Bit-shuffle a block by dynamically dispatching to the appropriate + hardware-accelerated routine at run-time. */ +int +bitshuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, const uint8_t* _dest, + const uint8_t* _tmp) { + int size = blocksize / bytesoftype; + /* Initialize the shuffle implementation if necessary. */ + init_shuffle_implementation(); + + if ((size % 8) == 0) + /* The number of elems is a multiple of 8 which is supported by + bitshuffle. */ + return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest, + blocksize / bytesoftype, + bytesoftype, (void*)_tmp); + else + memcpy((void*)_dest, (void*)_src, blocksize); + return size; } -#endif /* __SSE2__ */ +/* Bit-unshuffle a block by dynamically dispatching to the appropriate + hardware-accelerated routine at run-time. */ +int +bitunshuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, const uint8_t* _dest, + const uint8_t* _tmp) { + int size = blocksize / bytesoftype; + /* Initialize the shuffle implementation if necessary. */ + init_shuffle_implementation(); + + if ((size % 8) == 0) + /* The number of elems is a multiple of 8 which is supported by + bitshuffle. 
*/ + return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest, + blocksize / bytesoftype, + bytesoftype, (void*)_tmp); + else + memcpy((void*)_dest, (void*)_src, blocksize); + return size; +} diff --git a/c-blosc/blosc/shuffle.h b/c-blosc/blosc/shuffle.h index d30d84eea..d0b6ddc6e 100644 --- a/c-blosc/blosc/shuffle.h +++ b/c-blosc/blosc/shuffle.h @@ -1,16 +1,67 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library - Author: Francesc Alted + Author: Francesc Alted See LICENSES/BLOSC.txt for details about copyright and rights to use. **********************************************************************/ +/* Shuffle/unshuffle routines which dynamically dispatch to hardware- + accelerated routines based on the processor's architecture. + Consumers should almost always prefer to call these routines instead + of directly calling one of the hardware-accelerated routines, since + these are cross-platform and future-proof. */ -/* Shuffle/unshuffle routines */ +#ifndef SHUFFLE_H +#define SHUFFLE_H -void shuffle(size_t bytesoftype, size_t blocksize, - unsigned char* _src, unsigned char* _dest); +#include "shuffle-common.h" -void unshuffle(size_t bytesoftype, size_t blocksize, - unsigned char* _src, unsigned char* _dest); +#ifdef __cplusplus +extern "C" { +#endif + +/** + Primary shuffle and bitshuffle routines. + This function dynamically dispatches to the appropriate hardware-accelerated + routine based on the host processor's architecture. If the host processor is + not supported by any of the hardware-accelerated routines, the generic + (non-accelerated) implementation is used instead. + Consumers should almost always prefer to call this routine instead of directly + calling the hardware-accelerated routines because this method is both cross- + platform and future-proof. 
+*/ +BLOSC_NO_EXPORT void +shuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* _src, const uint8_t* _dest); + +BLOSC_NO_EXPORT int +bitshuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, const uint8_t* _dest, + const uint8_t* _tmp); + +/** + Primary unshuffle and bitunshuffle routine. + This function dynamically dispatches to the appropriate hardware-accelerated + routine based on the host processor's architecture. If the host processor is + not supported by any of the hardware-accelerated routines, the generic + (non-accelerated) implementation is used instead. + Consumers should almost always prefer to call this routine instead of directly + calling the hardware-accelerated routines because this method is both cross- + platform and future-proof. +*/ +BLOSC_NO_EXPORT void +unshuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* _src, const uint8_t* _dest); + + +BLOSC_NO_EXPORT int +bitunshuffle(const size_t bytesoftype, const size_t blocksize, + const uint8_t* const _src, const uint8_t* _dest, + const uint8_t* _tmp); + +#ifdef __cplusplus +} +#endif + +#endif /* SHUFFLE_H */ diff --git a/c-blosc/cmake/FindSSE.cmake b/c-blosc/cmake/FindSSE.cmake deleted file mode 100644 index b5e4ce1bb..000000000 --- a/c-blosc/cmake/FindSSE.cmake +++ /dev/null @@ -1,125 +0,0 @@ -# Check if SSE/AVX instructions are available on the machine where -# the project is compiled. 
- -IF(CMAKE_SYSTEM_NAME MATCHES "Linux") - EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) - - STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE) - IF (SSE2_TRUE) - set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") - ELSE (SSE2_TRUE) - set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") - ENDIF (SSE2_TRUE) - - # /proc/cpuinfo apparently omits sse3 :( - STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE) - IF (NOT SSE3_TRUE) - STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE) - ENDIF (NOT SSE3_TRUE) - - STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE) - IF (SSE3_TRUE OR SSSE3_TRUE) - set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") - ELSE (SSE3_TRUE OR SSSE3_TRUE) - set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") - ENDIF (SSE3_TRUE OR SSSE3_TRUE) - IF (SSSE3_TRUE) - set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host") - ELSE (SSSE3_TRUE) - set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") - ENDIF (SSSE3_TRUE) - - STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE) - IF (SSE41_TRUE) - set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") - ELSE (SSE41_TRUE) - set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") - ENDIF (SSE41_TRUE) - - STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "avx" "${SSE_THERE}" AVX_TRUE) - IF (AVX_TRUE) - set(AVX_FOUND true CACHE BOOL "AVX available on host") - ELSE (AVX_TRUE) - set(AVX_FOUND false CACHE BOOL "AVX available on host") - ENDIF (AVX_TRUE) -ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") - EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE - 
CPUINFO) - - STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE) - IF (SSE2_TRUE) - set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") - ELSE (SSE2_TRUE) - set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") - ENDIF (SSE2_TRUE) - - STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE) - IF (SSE3_TRUE) - set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") - ELSE (SSE3_TRUE) - set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") - ENDIF (SSE3_TRUE) - - STRING(REGEX REPLACE "^.*(SSSE3).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "SSSE3" "${SSE_THERE}" SSSE3_TRUE) - IF (SSSE3_TRUE) - set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host") - ELSE (SSSE3_TRUE) - set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") - ENDIF (SSSE3_TRUE) - - STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE) - IF (SSE41_TRUE) - set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") - ELSE (SSE41_TRUE) - set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") - ENDIF (SSE41_TRUE) - - STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "AVX" "${SSE_THERE}" AVX_TRUE) - IF (AVX_TRUE) - set(AVX_FOUND true CACHE BOOL "AVX available on host") - ELSE (AVX_TRUE) - set(AVX_FOUND false CACHE BOOL "AVX available on host") - ENDIF (AVX_TRUE) -ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows") - # TODO - set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") - set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") - set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") - set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") - set(AVX_FOUND false CACHE BOOL "AVX available on host") -ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux") - set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") - 
set(SSE3_FOUND false CACHE BOOL "SSE3 available on host") - set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") - set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") - set(AVX_FOUND false CACHE BOOL "AVX available on host") -ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux") - -if(NOT SSE2_FOUND) - MESSAGE(STATUS "Could not find hardware support for SSE2 on this machine.") -endif(NOT SSE2_FOUND) -if(NOT SSE3_FOUND) - MESSAGE(STATUS "Could not find hardware support for SSE3 on this machine.") -endif(NOT SSE3_FOUND) -if(NOT SSSE3_FOUND) - MESSAGE(STATUS "Could not find hardware support for SSSE3 on this machine.") -endif(NOT SSSE3_FOUND) -if(NOT SSE4_1_FOUND) - MESSAGE(STATUS "Could not find hardware support for SSE4.1 on this machine.") -endif(NOT SSE4_1_FOUND) -if(NOT AVX_FOUND) - MESSAGE(STATUS "Could not find hardware support for AVX on this machine.") -endif(NOT AVX_FOUND) - -mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND, AVX_FOUND) diff --git a/c-blosc/examples/multithread.c b/c-blosc/examples/multithread.c index 4198cfec3..556565289 100644 --- a/c-blosc/examples/multithread.c +++ b/c-blosc/examples/multithread.c @@ -56,7 +56,7 @@ int main(){ /* Register the filter with the library */ printf("Blosc version info: %s (%s)\n", - BLOSC_VERSION_STRING, BLOSC_VERSION_DATE); + BLOSC_VERSION_STRING, BLOSC_VERSION_DATE); /* Initialize the Blosc compressor */ blosc_init(); @@ -83,18 +83,18 @@ int main(){ return dsize; } - /* After using it, destroy the Blosc environment */ - blosc_destroy(); - for(i=0;i #define SIZE 100*100*100 -#define SHAPE {100,100,100} -#define CHUNKSHAPE {1,100,100} int main(){ static float data[SIZE]; diff --git a/c-blosc/examples/win-dynamic-linking.c b/c-blosc/examples/win-dynamic-linking.c new file mode 100644 index 000000000..28883b119 --- /dev/null +++ b/c-blosc/examples/win-dynamic-linking.c @@ -0,0 +1,128 @@ +/* + Copyright (C) 2015 Francesc Alted + http://blosc.org + License: MIT (see LICENSE.txt) + + Example 
program demonstrating use of the Blosc filter using the Windows Run-Time Dynamic Linking technique: + + https://msdn.microsoft.com/en-us/library/windows/desktop/ms686944(v=vs.85).aspx + + This allows to link your app in run-time with DLLs made with different compatible compilers + (e.g. VS2013 and mingw-w64). + + To compile this program (be aware that you should match your compiler 32-bit/64-bit with your DLL): + + cl /Ox /Fewin-dynamic-linking.exe /I..\blosc win-dynamic-linking.c + + To run: + + $ win-dynamic-linking.exe + Blosc version info: 1.7.0.dev + Compression: 400000000 -> 19928862 (20.1x) + Decompression succesful! + Succesful roundtrip! + +*/ + +#include +#include +#include + +#define SIZE 100*1000*1000 +#define SHAPE {100,1000,1000} +#define CHUNKSHAPE {1,1000,1000} + +/* Definition for the compression and decompression blosc routines */ +typedef int (__cdecl *COMPRESS_CTX)(int clevel, int doshuffle, size_t typesize, + size_t nbytes, const void* src, void* dest, + size_t destsize, const char* compressor, + size_t blocksize, int numinternalthreads); + +typedef int (__cdecl *DECOMPRESS_CTX)(const void *src, void *dest, + size_t destsize, int numinternalthreads); +typedef char* (__cdecl *GET_VERSION_STRING)(void); + + +int main(){ + HINSTANCE BDLL; /* Handle to DLL */ + COMPRESS_CTX blosc_compress_ctx; /* Function pointer for compression */ + DECOMPRESS_CTX blosc_decompress_ctx; /* Function pointer for decompression */ + GET_VERSION_STRING blosc_get_version_string; + + static float data[SIZE]; + static float data_out[SIZE]; + static float data_dest[SIZE]; + int isize = SIZE*sizeof(float), osize = SIZE*sizeof(float); + int dsize = SIZE*sizeof(float), csize; + int i; + + BDLL = LoadLibrary(TEXT("myblosc.dll")); + if (BDLL == NULL) { + printf("Cannot find myblosc.dll library!\n"); + goto out; + } + + blosc_compress_ctx = (COMPRESS_CTX)GetProcAddress(BDLL, "blosc_compress_ctx"); + if (!blosc_compress_ctx) { + // handle the error + printf("Cannot find 
blosc_compress_ctx() function!\n"); + goto out; + } + + blosc_decompress_ctx = (DECOMPRESS_CTX)GetProcAddress(BDLL, "blosc_decompress_ctx"); + if (!blosc_decompress_ctx) { + // handle the error + printf("Cannot find blosc_decompress_ctx() function!\n"); + goto out; + } + + blosc_get_version_string = (GET_VERSION_STRING)GetProcAddress(BDLL, "blosc_get_version_string"); + if (!blosc_get_version_string) { + // handle the error + printf("Cannot find blosc_get_version_string() function!\n"); + goto out; + } + + for(i=0; i %d (%.1fx)\n", isize, csize, (1.*isize) / csize); + + /* Decompress */ + dsize = blosc_decompress_ctx(data_out, data_dest, dsize, 1); + if (dsize < 0) { + printf("Decompression error. Error code: %d\n", dsize); + return dsize; + } + + printf("Decompression succesful!\n"); + + for(i=0;i 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/************************************** +* Includes +**************************************/ +#include "lz4.h" + + +/************************************** +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline 
+# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#else +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# if defined(__GNUC__) || defined(__clang__) +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ + +/* LZ4_GCC_VERSION is defined into lz4.h */ +#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/************************************** +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** +* Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/************************************** +* Reading and writing into memory +**************************************/ +#define STEPSIZE sizeof(size_t) + +static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 i; BYTE c[4]; } one = { 1 }; // don't use static : performance detrimental + return one.c[0]; +} + + +#if 
defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static size_t LZ4_read_ARCH(const void* memPtr) { return *(const size_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; size_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static size_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } + +#else + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static size_t LZ4_read_ARCH(const void* memPtr) +{ + size_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif // LZ4_FORCE_MEMORY_ACCESS + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) + { + return LZ4_read16(memPtr); + } + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) + { + LZ4_write16(memPtr, value); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static void LZ4_copy8(void* dst, const void* src) 
+{ + memcpy(dst,src,8); +} + +/* customized variant of memcpy, which can overwrite up to 7 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + +#if 0 + const size_t l2 = 8 - (((size_t)d) & (sizeof(void*)-1)); + LZ4_copy8(d,s); if (d>e-9) return; + d+=l2; s+=l2; +#endif /* join to align */ + + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } + else /* Big Endian CPU */ + { + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += 
(!val); + return r; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/************************************** +* Local Structures and types +**************************************/ +typedef struct { + U32 hashTable[HASH_SIZE_U32]; + U32 currentOffset; + U32 initCheck; + const BYTE* dictionary; + BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ + U32 dictSize; +} LZ4_stream_t_internal; + +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + + +/************************************** +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + + +/******************************** +* Compression functions +********************************/ + +static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return 
(((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static const U64 prime5bytes = 889523592379ULL; +static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + const U32 hashMask = (1<> (40 - hashLog)) & hashMask; +} + +static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) +{ + if (LZ4_64bits()) + return LZ4_hashSequence64(sequence, tableType); + return LZ4_hashSequence((U32)sequence, tableType); +} + +static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +FORCE_INLINE int 
LZ4_compress_generic( + void* const ctx, + const char* const source, + char* const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; + + const BYTE* ip = (const BYTE*) source; + const BYTE* base; + const BYTE* lowLimit; + const BYTE* const lowRefLimit = ip - dictPtr->dictSize; + const BYTE* const dictionary = dictPtr->dictionary; + const BYTE* const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 forwardH; + size_t refDelta=0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = (const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (inputSize> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + forwardH = 
LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) + || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; /* Check output limit */ + if (litLength>=RUN_MASK) + { + int len = (int)litLength-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchLength; + if (ip==limit) + { + unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchLength += more; + ip += more; + } + } + else + { + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchLength; + } + + if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) + return 0; /* Check output limit */ + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } + if (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + 
lowLimit = (const BYTE*)source; + } + } + LZ4_putPosition(ip, ctx, tableType, base); + if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) + && (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + const size_t lastRun = (size_t)(iend - anchor); + if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) + return 0; /* Check output limit */ + if (lastRun >= RUN_MASK) + { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRun<= LZ4_compressBound(inputSize)) + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } + else + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ +#if (HEAPMODE) + void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctx; + void* ctxPtr = &ctx; +#endif + + int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t ctx; + + LZ4_resetStream(&ctx); + + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); +} + + +/******************************** +* destSize variant +********************************/ + +static int LZ4_compress_destSize_generic( + void* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* base = (const BYTE*) src; + const BYTE* lowLimit = (const BYTE*) src; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + targetDstSize; + BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; + BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); + BYTE* const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + + /* Init conditions */ + if (targetDstSize < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (*srcSizePtr> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if (op + ((litLength+240)/255) + litLength > oMaxLit) + { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength>=RUN_MASK) + { + unsigned len = litLength - RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< oMaxMatch) + { + /* Match description too long : reduce it */ + matchLength = (15-1) + (oMaxMatch-op) * 255; + } + ip += MINMATCH + matchLength; + + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of block */ + if (ip > mflimit) break; + if (op > oMaxSeq) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ( (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) + { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend-op) - 1; + lastRunSize -= (lastRunSize+240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) + { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 
255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ + { + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } + else + { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); + else + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr); + } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (HEAPMODE) + void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctxBody; + void* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/******************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + LZ4_resetStream(lz4s); + return lz4s; +} + +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(size_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + + if (dictSize < (int)HASH_UNIT) 
+ { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->currentOffset += 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) + { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p+=3; + } + + return dict->dictSize; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ + { + /* rescale hash table */ + U32 delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* Check overlapping input/dictionary space */ + { + const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) + { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) 
streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); + + result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize 
= (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/******************************* +* Decompression functions +*******************************/ +/* + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is essential this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* const source, + char* const dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* == dest if dict == noDict */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* ip = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = lowPrefix - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; + const unsigned dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; + const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Special cases */ + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + + /* Main Loop */ + while (1) + { + unsigned token; + size_t length; + const BYTE* match; + size_t offset; + + /* get literal length */ + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s; + do + { + s = *ip++; + length += s; + } + while ( likely(endOnInput ? 
ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside buffers */ + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) + { + unsigned s; + do + { + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + s = *ip++; + length += s; + } while (s==255); + if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + + /* check external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) + { + if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ + + if (length <= (size_t)(lowPrefix-match)) + { + /* match can be copied as a single segment from external dictionary */ + match = dictEnd - (lowPrefix-match); + memmove(op, match, length); op += length; + } + else + { + /* match encompass external dictionary and current block */ + size_t copySize = (size_t)(lowPrefix-match); + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + copySize = length - copySize; + if (copySize > 
(size_t)(op-lowPrefix)) /* overlap copy */ + { + BYTE* const endOfMatch = op + copySize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } + else + { + memcpy(op, lowPrefix, copySize); + op += copySize; + } + } + continue; + } + + /* copy match within block */ + cpy = op + length; + if (unlikely(offset<8)) + { + const int dec64 = dec64table[offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[offset]; + memcpy(op+4, match, 4); + match -= dec64; + } else { LZ4_copy8(op, match); match+=8; } + op += 8; + + if (unlikely(cpy>oend-12)) + { + BYTE* const oCopyLimit = oend-(WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) + { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (opprefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. 
+ If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, 
lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); + if (dictStart+dictSize == dest) + { + if (dictSize >= (int)(64 KB - 1)) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); +} + +/* debug function */ +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); 
+} + + +/*************************************************** +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } +int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } + +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) +{ + MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); + lz4ds->bufferStart = base; +} + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; + int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); + return (char*)(ctx->bufferStart + dictSize); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +#endif /* LZ4_COMMONDEFS_ONLY */ + diff --git 
a/c-blosc/internal-complibs/lz4-1.7.2/lz4.h b/c-blosc/internal-complibs/lz4-1.7.2/lz4.h new file mode 100644 index 000000000..96e25a661 --- /dev/null +++ b/c-blosc/internal-complibs/lz4-1.7.2/lz4.h @@ -0,0 +1,360 @@ +/* + LZ4 - Fast LZ compression algorithm + Header File + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +/* + * lz4.h provides block compression functions, and gives full buffer control to programmer. 
+ * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), + * and can let the library handle its own memory, please use lz4frame.h instead. +*/ + +/************************************** +* Version +**************************************/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 2 /* for tweaks, bug-fixes, or development */ +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) +int LZ4_versionNumber (void); + +/************************************** +* Tuning parameter +**************************************/ +/* + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#define LZ4_MEMORY_USAGE 14 + + +/************************************** +* Simple Functions +**************************************/ + +int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); +int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); + +/* +LZ4_compress_default() : + Compresses 'sourceSize' bytes from buffer 'source' + into already allocated 'dest' buffer of size 'maxDestSize'. + Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'source' into a more limited 'dest' budget, + compression stops *immediately*, and the function result is zero. + As a consequence, 'dest' content is not valid. + This function never writes outside 'dest' buffer, nor read outside 'source' buffer. 
+ sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) + return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) + or 0 if compression fails + +LZ4_decompress_safe() : + compressedSize : is the precise full size of the compressed block. + maxDecompressedSize : is the size of destination buffer, which must be already allocated. + return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) + If destination buffer is not large enough, decoding will stop and output an error code (<0). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function is protected against buffer overflow exploits, including malicious data packets. + It never writes outside output buffer, nor reads outside input buffer. +*/ + + +/************************************** +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). 
+ Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ +int LZ4_compressBound(int inputSize); + +/* +LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows to select an "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. +*/ +int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_fast_extState() : + Same compression function, just using an externally allocated memory space to store compression state. + Use LZ4_sizeofState() to know how much memory must be allocated, + and allocate it on 8-bytes boundaries (using malloc() typically). + Then, provide it as 'void* state' to compression function. +*/ +int LZ4_sizeofState(void); +int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_destSize() : + Reverse the logic, by compressing as much data as possible from 'source' buffer + into already allocated buffer 'dest' of size 'targetDestSize'. + This function either compresses the entire 'source' content into 'dest' if it's large enough, + or fill 'dest' buffer completely with as much data as possible from 'source'. + *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'. + New value is necessarily <= old value. 
+ return : Nb bytes written into 'dest' (necessarily <= targetDestSize) + or 0 if compression fails +*/ +int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); + + +/* +LZ4_decompress_fast() : + originalSize : is the original and therefore uncompressed size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. + note : This function fully respect memory boundaries for properly formed compressed data. + It is a bit faster than LZ4_decompress_safe(). + However, it does not provide any protection against intentionally modified data stream (malicious input). + Use this function in trusted environment only (data to decode comes from a trusted source). +*/ +int LZ4_decompress_fast (const char* source, char* dest, int originalSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'compressedSize' at position 'source' + into destination buffer 'dest' of size 'maxDecompressedSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets +*/ +int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); + + +/*********************************************** +* Streaming Compression Functions +***********************************************/ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) +/* + * LZ4_stream_t + * information structure to track an LZ4 stream. + * important : init this structure content before first use ! + * note : only allocated directly the structure if you are statically linking LZ4 + * If you are using liblz4 as a DLL, please use below construction methods instead. + */ +typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; + +/* + * LZ4_resetStream + * Use this function to init an allocated LZ4_stream_t structure + */ +void LZ4_resetStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_createStream will allocate and initialize an LZ4_stream_t structure + * LZ4_freeStream releases its memory. + * In the context of a DLL (liblz4), please use these methods rather than the static struct. + * They are more future proof, in case of a change of LZ4_stream_t size. + */ +LZ4_stream_t* LZ4_createStream(void); +int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_loadDict + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed. + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/* + * LZ4_compress_fast_continue + * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still be present and unmodified ! 
+ * 'dst' buffer must be already allocated. + * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. + */ +int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); + +/* + * LZ4_saveDict + * If previously compressed data block is not guaranteed to remain available at its memory location + * save it into a safer place (char* safeBuffer) + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error + */ +int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); + + +/************************************************ +* Streaming Decompression Functions +************************************************/ + +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; +/* + * LZ4_streamDecode_t + * information structure to track an LZ4 stream. + * init this structure content using LZ4_setStreamDecode or memset() before first use ! + * + * In the context of a DLL (liblz4) please prefer usage of construction methods below. + * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. + * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure + * LZ4_freeStreamDecode releases its memory. + */ +LZ4_streamDecode_t* LZ4_createStreamDecode(void); +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/* + * LZ4_setStreamDecode + * Use this function to instruct where to find the dictionary. 
+ * Setting a size of 0 is allowed (same effect as reset). + * Return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) + In the case of a ring buffers, decoding buffer must be either : + - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) + In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). + - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including small ones ( < 64 KB). + - _At least_ 64 KB + 8 bytes + maxBlockSize. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including larger than decoding buffer. + Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, + and indicate where it is saved using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as + a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() + They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. 
+*/ +int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); +int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); + + +/************************************** +* Obsolete Functions +**************************************/ +/* Deprecate Warnings */ +/* Should these warnings messages be a problem, + it is generally possible to disable them, + with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual for example. + Otherwise, you can also define LZ4_DISABLE_DEPRECATE_WARNINGS */ +#define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED() /* disable deprecation warnings */ +#else +# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/* Obsolete compression functions */ +/* These functions will generate warnings in a future release */ +int LZ4_compress (const char* source, char* dest, int sourceSize); +int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const 
char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ +/* These function names are completely deprecated and must no longer be used. + They are only provided in lz4.c for compatibility with older programs. + - LZ4_uncompress is the same as LZ4_decompress_fast + - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe + These function prototypes are now disabled; uncomment them only if you really need them. + It is highly recommended to stop using these prototypes and migrate to maintained ones */ +/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */ +/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */ + +/* Obsolete streaming functions; use new streaming interface whenever possible */ +LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + + +#if defined (__cplusplus) +} +#endif diff --git a/c-blosc/internal-complibs/lz4-1.7.2/lz4hc.c b/c-blosc/internal-complibs/lz4-1.7.2/lz4hc.c new file mode 100644 index 000000000..80bfa3967 --- /dev/null +++ b/c-blosc/internal-complibs/lz4-1.7.2/lz4hc.c @@ -0,0 +1,748 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Copyright (C) 2011-2015, Yann Collet. 
+ + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + + +/* ************************************* +* Tuning Parameter +***************************************/ +static const int LZ4HC_compressionLevel_default = 9; + +/*! + * HEAPMODE : + * Select how default compression function will allocate workplace memory, + * in stack (0:fastest), or in heap (1:requires malloc()). + * Since workplace is rather large, heap mode is recommended. 
+ */ +#define LZ4HC_HEAPMODE 0 + + +/* ************************************* +* Includes +***************************************/ +#include "lz4hc.h" + + +/* ************************************* +* Local Compiler Options +***************************************/ +#if defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#if defined (__clang__) +# pragma clang diagnostic ignored "-Wunused-function" +#endif + + +/* ************************************* +* Common LZ4 definition +***************************************/ +#define LZ4_COMMONDEFS_ONLY +#include "lz4.c" + + +/* ************************************* +* Local Constants +***************************************/ +#define DICTIONARY_LOGSIZE 16 +#define MAXD (1<> ((MINMATCH*8)-HASH_LOG)) +//#define DELTANEXTU16(p) chainTable[(p) & MAXD_MASK] /* flexible, MAXD dependent */ +#define DELTANEXTU16(p) chainTable[(U16)(p)] /* faster */ + +static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); } + + + +/************************************** +* HC Compression +**************************************/ +static void LZ4HC_init (LZ4HC_Data_Structure* hc4, const BYTE* start) +{ + MEM_INIT((void*)hc4->hashTable, 0, sizeof(hc4->hashTable)); + MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); + hc4->nextToUpdate = 64 KB; + hc4->base = start - 64 KB; + hc4->end = start; + hc4->dictBase = start - 64 KB; + hc4->dictLimit = 64 KB; + hc4->lowLimit = 64 KB; +} + + +/* Update chains up to ip (excluded) */ +FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip) +{ + U16* chainTable = hc4->chainTable; + U32* HashTable = hc4->hashTable; + const BYTE* const base = hc4->base; + const U32 target = (U32)(ip - base); + U32 idx = hc4->nextToUpdate; + + while(idx < target) + { + U32 h = LZ4HC_hashPtr(base+idx); + size_t delta = idx - HashTable[h]; + if (delta>MAX_DISTANCE) delta = MAX_DISTANCE; + DELTANEXTU16(idx) = (U16)delta; + HashTable[h] = idx; + idx++; + 
} + + hc4->nextToUpdate = target; +} + + +FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* Index table will be updated */ + const BYTE* ip, const BYTE* const iLimit, + const BYTE** matchpos, + const int maxNbAttempts) +{ + U16* const chainTable = hc4->chainTable; + U32* const HashTable = hc4->hashTable; + const BYTE* const base = hc4->base; + const BYTE* const dictBase = hc4->dictBase; + const U32 dictLimit = hc4->dictLimit; + const U32 lowLimit = (hc4->lowLimit + 64 KB > (U32)(ip-base)) ? hc4->lowLimit : (U32)(ip - base) - (64 KB - 1); + U32 matchIndex; + const BYTE* match; + int nbAttempts=maxNbAttempts; + size_t ml=0; + + /* HC4 match finder */ + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + + while ((matchIndex>=lowLimit) && (nbAttempts)) + { + nbAttempts--; + if (matchIndex >= dictLimit) + { + match = base + matchIndex; + if (*(match+ml) == *(ip+ml) + && (LZ4_read32(match) == LZ4_read32(ip))) + { + size_t mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, iLimit) + MINMATCH; + if (mlt > ml) { ml = mlt; *matchpos = match; } + } + } + else + { + match = dictBase + matchIndex; + if (LZ4_read32(match) == LZ4_read32(ip)) + { + size_t mlt; + const BYTE* vLimit = ip + (dictLimit - matchIndex); + if (vLimit > iLimit) vLimit = iLimit; + mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, vLimit) + MINMATCH; + if ((ip+mlt == vLimit) && (vLimit < iLimit)) + mlt += LZ4_count(ip+mlt, base+dictLimit, iLimit); + if (mlt > ml) { ml = mlt; *matchpos = base + matchIndex; } /* virtual matchpos */ + } + } + matchIndex -= DELTANEXTU16(matchIndex); + } + + return (int)ml; +} + + +FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch ( + LZ4HC_Data_Structure* hc4, + const BYTE* const ip, + const BYTE* const iLowLimit, + const BYTE* const iHighLimit, + int longest, + const BYTE** matchpos, + const BYTE** startpos, + const int maxNbAttempts) +{ + U16* const chainTable = hc4->chainTable; + U32* const HashTable = hc4->hashTable; + const BYTE* const base = 
hc4->base; + const U32 dictLimit = hc4->dictLimit; + const BYTE* const lowPrefixPtr = base + dictLimit; + const U32 lowLimit = (hc4->lowLimit + 64 KB > (U32)(ip-base)) ? hc4->lowLimit : (U32)(ip - base) - (64 KB - 1); + const BYTE* const dictBase = hc4->dictBase; + U32 matchIndex; + int nbAttempts = maxNbAttempts; + int delta = (int)(ip-iLowLimit); + + + /* First Match */ + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + + while ((matchIndex>=lowLimit) && (nbAttempts)) + { + nbAttempts--; + if (matchIndex >= dictLimit) + { + const BYTE* matchPtr = base + matchIndex; + if (*(iLowLimit + longest) == *(matchPtr - delta + longest)) + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) + { + int mlt = MINMATCH + LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit); + int back = 0; + + while ((ip+back>iLowLimit) + && (matchPtr+back > lowPrefixPtr) + && (ip[back-1] == matchPtr[back-1])) + back--; + + mlt -= back; + + if (mlt > longest) + { + longest = (int)mlt; + *matchpos = matchPtr+back; + *startpos = ip+back; + } + } + } + else + { + const BYTE* matchPtr = dictBase + matchIndex; + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) + { + size_t mlt; + int back=0; + const BYTE* vLimit = ip + (dictLimit - matchIndex); + if (vLimit > iHighLimit) vLimit = iHighLimit; + mlt = LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + if ((ip+mlt == vLimit) && (vLimit < iHighLimit)) + mlt += LZ4_count(ip+mlt, base+dictLimit, iHighLimit); + while ((ip+back > iLowLimit) && (matchIndex+back > lowLimit) && (ip[back-1] == matchPtr[back-1])) back--; + mlt -= back; + if ((int)mlt > longest) { longest = (int)mlt; *matchpos = base + matchIndex + back; *startpos = ip+back; } + } + } + matchIndex -= DELTANEXTU16(matchIndex); + } + + return longest; +} + + +typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; + +#define LZ4HC_DEBUG 0 +#if LZ4HC_DEBUG +static unsigned debug = 0; +#endif + +FORCE_INLINE int LZ4HC_encodeSequence ( + const BYTE** ip, + BYTE** 
op, + const BYTE** anchor, + int matchLength, + const BYTE* const match, + limitedOutput_directive limitedOutputBuffer, + BYTE* oend) +{ + int length; + BYTE* token; + +#if LZ4HC_DEBUG + if (debug) printf("literal : %u -- match : %u -- offset : %u\n", (U32)(*ip - *anchor), (U32)matchLength, (U32)(*ip-match)); +#endif + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if ((limitedOutputBuffer) && ((*op + (length>>8) + length + (2 + 1 + LASTLITERALS)) > oend)) return 1; /* Check output limit */ + if (length>=(int)RUN_MASK) { int len; *token=(RUN_MASK< 254 ; len-=255) *(*op)++ = 255; *(*op)++ = (BYTE)len; } + else *token = (BYTE)(length<>8) + (1 + LASTLITERALS) > oend)) return 1; /* Check output limit */ + if (length>=(int)ML_MASK) { *token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } if (length > 254) { length-=255; *(*op)++ = 255; } *(*op)++ = (BYTE)length; } + else *token += (BYTE)(length); + + /* Prepare next loop */ + *ip += matchLength; + *anchor = *ip; + + return 0; +} + + +static int LZ4HC_compress_generic ( + void* ctxvoid, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int compressionLevel, + limitedOutput_directive limit + ) +{ + LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid; + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = (iend - LASTLITERALS); + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + unsigned maxNbAttempts; + int ml, ml2, ml3, ml0; + const BYTE* ref=NULL; + const BYTE* start2=NULL; + const BYTE* ref2=NULL; + const BYTE* start3=NULL; + const BYTE* ref3=NULL; + const BYTE* start0; + const BYTE* ref0; + + + /* init */ + if (compressionLevel > g_maxCompressionLevel) compressionLevel = g_maxCompressionLevel; + if (compressionLevel < 1) compressionLevel = 
LZ4HC_compressionLevel_default; + maxNbAttempts = 1 << (compressionLevel-1); + ctx->end += inputSize; + + ip++; + + /* Main Loop */ + while (ip < mflimit) + { + ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref), maxNbAttempts); + if (!ml) { ip++; continue; } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; + +_Search2: + if (ip+ml < mflimit) + ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2, maxNbAttempts); + else ml2 = ml; + + if (ml2 == ml) /* No better match */ + { + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + continue; + } + + if (start0 < ip) + { + if (start2 < ip + ml0) /* empirical */ + { + ip = start0; + ref = ref0; + ml = ml0; + } + } + + /* Here, start0==ip */ + if ((start2 - ip) < 3) /* First Match too small : removed */ + { + ml = ml2; + ip = start2; + ref =ref2; + goto _Search2; + } + +_Search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) + { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; + if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) + { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */ + + if (start2 + ml2 < mflimit) + ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, maxNbAttempts); + else ml3 = ml2; + + if (ml3 == ml2) /* No better match : 2 sequences to encode */ + { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) ml = (int)(start2 - ip); + /* Now, encode 2 sequences */ + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + ip = start2; + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml2, ref2, limit, oend)) 
return 0; + continue; + } + + if (start3 < ip+ml+3) /* Not enough space for match 2 : remove it */ + { + if (start3 >= (ip+ml)) /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ + { + if (start2 < ip+ml) + { + int correction = (int)(ip+ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) + { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _Search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _Search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least the first one + * ip & ref are known; Now for ml + */ + if (start2 < ip+ml) + { + if ((start2 - ip) < (int)ML_MASK) + { + int correction; + if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) + { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + else + { + ml = (int)(start2 - ip); + } + } + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _Search3; + } + + /* Encode Last Literals */ + { + int lastRun = (int)(iend - anchor); + if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */ + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun<base = NULL; + ((LZ4HC_Data_Structure*)LZ4_streamHCPtr)->compressionLevel = (unsigned)compressionLevel; +} + +int LZ4_loadDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, const char* dictionary, int dictSize) +{ + LZ4HC_Data_Structure* ctxPtr = 
(LZ4HC_Data_Structure*) LZ4_streamHCPtr; + if (dictSize > 64 KB) + { + dictionary += dictSize - 64 KB; + dictSize = 64 KB; + } + LZ4HC_init (ctxPtr, (const BYTE*)dictionary); + if (dictSize >= 4) LZ4HC_Insert (ctxPtr, (const BYTE*)dictionary +(dictSize-3)); + ctxPtr->end = (const BYTE*)dictionary + dictSize; + return dictSize; +} + + +/* compression */ + +static void LZ4HC_setExternalDict(LZ4HC_Data_Structure* ctxPtr, const BYTE* newBlock) +{ + if (ctxPtr->end >= ctxPtr->base + 4) + LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ + /* Only one memory segment for extDict, so any previous extDict is lost at this stage */ + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base); + ctxPtr->dictBase = ctxPtr->base; + ctxPtr->base = newBlock - ctxPtr->dictLimit; + ctxPtr->end = newBlock; + ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */ +} + +static int LZ4_compressHC_continue_generic (LZ4HC_Data_Structure* ctxPtr, + const char* source, char* dest, + int inputSize, int maxOutputSize, limitedOutput_directive limit) +{ + /* auto-init if forgotten */ + if (ctxPtr->base == NULL) + LZ4HC_init (ctxPtr, (const BYTE*) source); + + /* Check overflow */ + if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) + { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) - ctxPtr->dictLimit; + if (dictSize > 64 KB) dictSize = 64 KB; + + LZ4_loadDictHC((LZ4_streamHC_t*)ctxPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); + } + + /* Check if blocks follow each other */ + if ((const BYTE*)source != ctxPtr->end) + LZ4HC_setExternalDict(ctxPtr, (const BYTE*)source); + + /* Check overlapping input/dictionary space */ + { + const BYTE* sourceEnd = (const BYTE*) source + inputSize; + const BYTE* dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; + const BYTE* dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; + if ((sourceEnd > dictBegin) && ((const BYTE*)source < dictEnd)) + 
{ + if (sourceEnd > dictEnd) sourceEnd = dictEnd; + ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase); + if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) ctxPtr->lowLimit = ctxPtr->dictLimit; + } + } + + return LZ4HC_compress_generic (ctxPtr, source, dest, inputSize, maxOutputSize, ctxPtr->compressionLevel, limit); +} + +int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize) +{ + if (maxOutputSize < LZ4_compressBound(inputSize)) + return LZ4_compressHC_continue_generic ((LZ4HC_Data_Structure*)LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, limitedOutput); + else + return LZ4_compressHC_continue_generic ((LZ4HC_Data_Structure*)LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, noLimit); +} + + +/* dictionary saving */ + +int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize) +{ + LZ4HC_Data_Structure* streamPtr = (LZ4HC_Data_Structure*)LZ4_streamHCPtr; + int prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit)); + if (dictSize > 64 KB) dictSize = 64 KB; + if (dictSize < 4) dictSize = 0; + if (dictSize > prefixSize) dictSize = prefixSize; + memmove(safeBuffer, streamPtr->end - dictSize, dictSize); + { + U32 endIndex = (U32)(streamPtr->end - streamPtr->base); + streamPtr->end = (const BYTE*)safeBuffer + dictSize; + streamPtr->base = streamPtr->end - endIndex; + streamPtr->dictLimit = endIndex - dictSize; + streamPtr->lowLimit = endIndex - dictSize; + if (streamPtr->nextToUpdate < streamPtr->dictLimit) streamPtr->nextToUpdate = streamPtr->dictLimit; + } + return dictSize; +} + + +/*********************************** +* Deprecated Functions +***********************************/ +/* Deprecated compression functions */ +/* These functions are planned to start generate warnings by r131 approximately */ +int LZ4_compressHC(const char* src, char* dst, int srcSize) { return LZ4_compress_HC (src, dst, srcSize, 
LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2(const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_withStateHC (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2_withStateHC (void* state, const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, LZ4_compressBound(srcSize)); } +int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, maxDstSize); } + + +/* Deprecated streaming functions */ +/* These functions currently generate deprecation warnings */ +int LZ4_sizeofStreamStateHC(void) { return LZ4_STREAMHCSIZE; } + +int LZ4_resetStreamStateHC(void* state, char* inputBuffer) +{ + if ((((size_t)state) & (sizeof(void*)-1)) != 0) return 1; /* Error : 
pointer is not aligned for pointer (32 or 64 bits) */ + LZ4HC_init((LZ4HC_Data_Structure*)state, (const BYTE*)inputBuffer); + ((LZ4HC_Data_Structure*)state)->inputBuffer = (BYTE*)inputBuffer; + return 0; +} + +void* LZ4_createHC (char* inputBuffer) +{ + void* hc4 = ALLOCATOR(1, sizeof(LZ4HC_Data_Structure)); + if (hc4 == NULL) return NULL; /* not enough memory */ + LZ4HC_init ((LZ4HC_Data_Structure*)hc4, (const BYTE*)inputBuffer); + ((LZ4HC_Data_Structure*)hc4)->inputBuffer = (BYTE*)inputBuffer; + return hc4; +} + +int LZ4_freeHC (void* LZ4HC_Data) +{ + FREEMEM(LZ4HC_Data); + return (0); +} + +int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel) +{ + return LZ4HC_compress_generic (LZ4HC_Data, source, dest, inputSize, 0, compressionLevel, noLimit); +} + +int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel) +{ + return LZ4HC_compress_generic (LZ4HC_Data, source, dest, inputSize, maxOutputSize, compressionLevel, limitedOutput); +} + +char* LZ4_slideInputBufferHC(void* LZ4HC_Data) +{ + LZ4HC_Data_Structure* hc4 = (LZ4HC_Data_Structure*)LZ4HC_Data; + int dictSize = LZ4_saveDictHC((LZ4_streamHC_t*)LZ4HC_Data, (char*)(hc4->inputBuffer), 64 KB); + return (char*)(hc4->inputBuffer + dictSize); +} diff --git a/c-blosc/internal-complibs/lz4-1.7.2/lz4hc.h b/c-blosc/internal-complibs/lz4-1.7.2/lz4hc.h new file mode 100644 index 000000000..431f7c87c --- /dev/null +++ b/c-blosc/internal-complibs/lz4-1.7.2/lz4hc.h @@ -0,0 +1,189 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Header File + Copyright (C) 2011-2015, Yann Collet. 
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#pragma once + + +#if defined (__cplusplus) +extern "C" { +#endif + +/***************************** +* Includes +*****************************/ +#include /* size_t */ + + +/************************************** +* Block Compression +**************************************/ +int LZ4_compress_HC (const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); +/* +LZ4_compress_HC : + Destination buffer 'dst' must be already allocated. 
+ Compression completion is guaranteed if 'dst' buffer is sized to handle worst circumstances (data not compressible) + Worst size evaluation is provided by function LZ4_compressBound() (see "lz4.h") + srcSize : Max supported value is LZ4_MAX_INPUT_SIZE (see "lz4.h") + compressionLevel : Recommended values are between 4 and 9, although any value between 0 and 16 will work. + 0 means "use default value" (see lz4hc.c). + Values >16 behave the same as 16. + return : the number of bytes written into buffer 'dst' + or 0 if compression fails. +*/ + + +/* Note : + Decompression functions are provided within LZ4 source code (see "lz4.h") (BSD license) +*/ + + +int LZ4_sizeofStateHC(void); +int LZ4_compress_HC_extStateHC(void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); +/* +LZ4_compress_HC_extStateHC() : + Use this function if you prefer to manually allocate memory for compression tables. + To know how much memory must be allocated for the compression tables, use : + int LZ4_sizeofStateHC(); + + Allocated memory must be aligned on 8-bytes boundaries (which a normal malloc() will do properly). + + The allocated memory can then be provided to the compression functions using 'void* state' parameter. + LZ4_compress_HC_extStateHC() is equivalent to previously described function. + It just uses externally allocated memory for stateHC. +*/ + + +/************************************** +* Streaming Compression +**************************************/ +#define LZ4_STREAMHCSIZE 262192 +#define LZ4_STREAMHCSIZE_SIZET (LZ4_STREAMHCSIZE / sizeof(size_t)) +typedef struct { size_t table[LZ4_STREAMHCSIZE_SIZET]; } LZ4_streamHC_t; +/* + LZ4_streamHC_t + This structure allows static allocation of LZ4 HC streaming state. + State must then be initialized using LZ4_resetStreamHC() before first use. + + Static allocation should only be used in combination with static linking. 
+ If you want to use LZ4 as a DLL, please use construction functions below, which are future-proof. +*/ + + +LZ4_streamHC_t* LZ4_createStreamHC(void); +int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr); +/* + These functions create and release memory for LZ4 HC streaming state. + Newly created states are already initialized. + Existing state space can be re-used anytime using LZ4_resetStreamHC(). + If you use LZ4 as a DLL, use these functions instead of static structure allocation, + to avoid size mismatch between different versions. +*/ + +void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel); +int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize); + +int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, const char* src, char* dst, int srcSize, int maxDstSize); + +int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize); + +/* + These functions compress data in successive blocks of any size, using previous blocks as dictionary. + One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks. + There is an exception for ring buffers, which can be smaller 64 KB. + Such case is automatically detected and correctly handled by LZ4_compress_HC_continue(). + + Before starting compression, state must be properly initialized, using LZ4_resetStreamHC(). + A first "fictional block" can then be designated as initial dictionary, using LZ4_loadDictHC() (Optional). + + Then, use LZ4_compress_HC_continue() to compress each successive block. + It works like LZ4_compress_HC(), but use previous memory blocks as dictionary to improve compression. + Previous memory blocks (including initial dictionary when present) must remain accessible and unmodified during compression. + As a reminder, size 'dst' buffer to handle worst cases, using LZ4_compressBound(), to ensure success of compression operation. 
+ + If, for any reason, previous data blocks can't be preserved unmodified in memory during next compression block, + you must save it to a safer memory space, using LZ4_saveDictHC(). + Return value of LZ4_saveDictHC() is the size of dictionary effectively saved into 'safeBuffer'. +*/ + + + +/************************************** +* Deprecated Functions +**************************************/ +/* Deprecate Warnings */ +/* Should these warnings messages be a problem, + it is generally possible to disable them, + with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual for example. + You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ +#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif // LZ4_DEPRECATE_WARNING_DEFBLOCK + +/* compression functions */ +/* these functions are planned to trigger warning messages by r131 approximately */ +int LZ4_compressHC (const char* source, char* dest, int inputSize); +int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_compressHC2 (const char* source, char* dest, int inputSize, int compressionLevel); +int LZ4_compressHC2_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +int LZ4_compressHC_withStateHC (void* state, const char* source, char* dest, int inputSize); +int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int 
inputSize, int maxOutputSize); +int LZ4_compressHC2_withStateHC (void* state, const char* source, char* dest, int inputSize, int compressionLevel); +int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +int LZ4_compressHC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize); +int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Streaming functions following the older model; should no longer be used */ +LZ4_DEPRECATED("use LZ4_createStreamHC() instead") void* LZ4_createHC (char* inputBuffer); +LZ4_DEPRECATED("use LZ4_saveDictHC() instead") char* LZ4_slideInputBufferHC (void* LZ4HC_Data); +LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") int LZ4_freeHC (void* LZ4HC_Data); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_createStreamHC() instead") int LZ4_sizeofStreamStateHC(void); +LZ4_DEPRECATED("use LZ4_resetStreamHC() instead") int LZ4_resetStreamStateHC(void* state, char* inputBuffer); + + +#if defined (__cplusplus) +} +#endif diff --git a/c-blosc/internal-complibs/lz4-r119/lz4.c b/c-blosc/internal-complibs/lz4-r119/lz4.c deleted file mode 100644 index 482a8ede3..000000000 --- a/c-blosc/internal-complibs/lz4-r119/lz4.c +++ /dev/null @@ -1,1247 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2014, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 source repository : http://code.google.com/p/lz4/ - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ - -/************************************** - Tuning parameters -**************************************/ -/* - * HEAPMODE : - * Select how default compression functions will allocate memory for their hash table, - * in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). - */ -#define HEAPMODE 0 - - -/************************************** - CPU Feature Detection -**************************************/ -/* 32 or 64 bits ? 
*/ -#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ - || defined(__powerpc64__) || defined(__powerpc64le__) \ - || defined(__ppc64__) || defined(__ppc64le__) \ - || defined(__PPC64__) || defined(__PPC64LE__) \ - || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) /* Detects 64 bits mode */ -# define LZ4_ARCH64 1 -#else -# define LZ4_ARCH64 0 -#endif - -/* - * Little Endian or Big Endian ? - * Overwrite the #define below if you know your architecture endianess - */ -#include /* Apparently required to detect endianess */ -#if defined (__GLIBC__) -# include -# if (__BYTE_ORDER == __BIG_ENDIAN) -# define LZ4_BIG_ENDIAN 1 -# endif -#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) -# define LZ4_BIG_ENDIAN 1 -#elif defined(__sparc) || defined(__sparc__) \ - || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ - || defined(__hpux) || defined(__hppa) \ - || defined(_MIPSEB) || defined(__s390__) -# define LZ4_BIG_ENDIAN 1 -#else -/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */ -#endif - -/* - * Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
- * For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property - * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance - */ -#if defined(__ARM_FEATURE_UNALIGNED) -# define LZ4_FORCE_UNALIGNED_ACCESS 1 -#endif - -/* Define this parameter if your target system or compiler does not support hardware bit count */ -#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ -# define LZ4_FORCE_SW_BITCOUNT -#endif - -/* - * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : - * This option may provide a small boost to performance for some big endian cpu, although probably modest. - * You may set this option to 1 if data will remain within closed environment. - * This option is useless on Little_Endian CPU (such as x86) - */ - -/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ - - -/************************************** - Compiler Options -**************************************/ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -/* "restrict" is a known keyword */ -#else -# define restrict /* Disable restrict */ -#endif - -#ifdef _MSC_VER /* Visual Studio */ -# define FORCE_INLINE static __forceinline -# include /* For Visual 2005 */ -# if LZ4_ARCH64 /* 64-bits */ -# pragma intrinsic(_BitScanForward64) /* For Visual 2005 */ -# pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */ -# else /* 32-bits */ -# pragma intrinsic(_BitScanForward) /* For Visual 2005 */ -# pragma intrinsic(_BitScanReverse) /* For Visual 2005 */ -# endif -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -#else -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -#endif - -#ifdef _MSC_VER /* Visual Studio */ -# define lz4_bswap16(x) _byteswap_ushort(x) -#else -# 
define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) -#endif - -#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) -#else -# define expect(expr,value) (expr) -#endif - -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) - - -/************************************** - Memory routines -**************************************/ -#include /* malloc, calloc, free */ -#define ALLOCATOR(n,s) calloc(n,s) -#define FREEMEM free -#include /* memset, memcpy */ -#define MEM_INIT memset - - -/************************************** - Includes -**************************************/ -#include "lz4.h" - - -/************************************** - Basic Types -**************************************/ -#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -#else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; -#endif - -#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) -# define _PACKED __attribute__ ((packed)) -#else -# define _PACKED -#endif - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(1) -# else -# pragma pack(push, 1) -# endif -#endif - -typedef struct { U16 v; } _PACKED U16_S; -typedef struct { U32 v; } _PACKED U32_S; -typedef struct { U64 v; } _PACKED U64_S; -typedef struct {size_t v;} _PACKED size_t_S; - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) -# pragma pack(0) -# else -# pragma pack(pop) -# endif -#endif - -#define A16(x) 
(((U16_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) -#define AARCH(x) (((size_t_S *)(x))->v) - - -/************************************** - Constants -**************************************/ -#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) -#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) -#define HASH_SIZE_U32 (1 << LZ4_HASHLOG) - -#define MINMATCH 4 - -#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH+MINMATCH) -static const int LZ4_minLength = (MFLIMIT+1); - -#define KB *(1U<<10) -#define MB *(1U<<20) -#define GB *(1U<<30) - -#define LZ4_64KLIMIT ((64 KB) + (MFLIMIT-1)) -#define SKIPSTRENGTH 6 /* Increasing this value will make the compression run slower on incompressible data */ - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) - -#define ML_BITS 4 -#define ML_MASK ((1U<=e; */ -#else -# define LZ4_WILDCOPY(d,s,e) { if (likely(e-d <= 8)) LZ4_COPY8(d,s) else do { LZ4_COPY8(d,s) } while (d>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); -# else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif -# endif -} - -#else - -int LZ4_NbCommonBytes (register U32 val) -{ -# if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && 
!defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -# else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif -# else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif -# endif -} - -#endif - - -/******************************** - Compression functions -********************************/ -int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } - -static int LZ4_hashSequence(U32 sequence, tableType_t tableType) -{ - if (tableType == byU16) - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); - else - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); -} - -static int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } - -static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - switch (tableType) - { - case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } - } -} - -static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - 
LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); -} - -static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } - if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } - { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ -} - -static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); -} - -static unsigned LZ4_count(const BYTE* pIn, const BYTE* pRef, const BYTE* pInLimit) -{ - const BYTE* const pStart = pIn; - - while (likely(pIndictSize; - const BYTE* const dictionary = dictPtr->dictionary; - const BYTE* const dictEnd = dictionary + dictPtr->dictSize; - const size_t dictDelta = dictEnd - (const BYTE*)source; - const BYTE* anchor = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - const BYTE* const mflimit = iend - MFLIMIT; - const BYTE* const matchlimit = iend - LASTLITERALS; - - BYTE* op = (BYTE*) dest; - BYTE* const olimit = op + maxOutputSize; - - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - size_t refDelta=0; - - /* Init conditions */ - if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ - switch(dict) - { - case noDict: - default: - base = (const BYTE*)source; - lowLimit = (const BYTE*)source; - break; - case withPrefix64k: - base = (const BYTE*)source - dictPtr->currentOffset; - lowLimit = (const BYTE*)source - dictPtr->dictSize; - break; - case usingExtDict: - base = (const BYTE*)source - dictPtr->currentOffset; - lowLimit = (const BYTE*)source; - break; - } - if ((tableType == byU16) && (inputSize>=(int)LZ4_64KLIMIT)) return 0; /* Size too large (not 
within 64K limit) */ - if (inputSize> skipStrength; - //if (step>8) step=8; // required for valid forwardIp ; slows down uncompressible data a bit - - if (unlikely(forwardIp > mflimit)) goto _last_literals; - - ref = LZ4_getPositionOnHash(h, ctx, tableType, base); - if (dict==usingExtDict) - { - if (ref<(const BYTE*)source) - { - refDelta = dictDelta; - lowLimit = dictionary; - } - else - { - refDelta = 0; - lowLimit = (const BYTE*)source; - } - } - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); - - } while ( ((dictIssue==dictSmall) ? (ref < lowRefLimit) : 0) - || ((tableType==byU16) ? 0 : (ref + MAX_DISTANCE < ip)) - || (A32(ref+refDelta) != A32(ip)) ); - } - - /* Catch up */ - while ((ip>anchor) && (ref+refDelta > lowLimit) && (unlikely(ip[-1]==ref[refDelta-1]))) { ip--; ref--; } - - { - /* Encode Literal length */ - unsigned litLength = (unsigned)(ip - anchor); - token = op++; - if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) - return 0; /* Check output limit */ - if (litLength>=RUN_MASK) - { - int len = (int)litLength-RUN_MASK; - *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; - } - else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; - matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, limit); - ip += MINMATCH + matchLength; - if (ip==limit) - { - unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); - matchLength += more; - ip += more; - } - } - else - { - matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, matchlimit); - ip += MINMATCH + matchLength; - } - - if (matchLength>=ML_MASK) - { - if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) - return 0; /* Check output limit */ - *token += ML_MASK; - matchLength -= ML_MASK; - for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } - if (matchLength >= 255) { matchLength-=255; *op++ = 255; } - *op++ = 
(BYTE)matchLength; - } - else *token += (BYTE)(matchLength); - } - - anchor = ip; - - /* Test end of chunk */ - if (ip > mflimit) break; - - /* Fill table */ - LZ4_putPosition(ip-2, ctx, tableType, base); - - /* Test next position */ - ref = LZ4_getPosition(ip, ctx, tableType, base); - if (dict==usingExtDict) - { - if (ref<(const BYTE*)source) - { - refDelta = dictDelta; - lowLimit = dictionary; - } - else - { - refDelta = 0; - lowLimit = (const BYTE*)source; - } - } - LZ4_putPosition(ip, ctx, tableType, base); - if ( ((dictIssue==dictSmall) ? (ref>=lowRefLimit) : 1) - && (ref+MAX_DISTANCE>=ip) - && (A32(ref+refDelta)==A32(ip)) ) - { token=op++; *token=0; goto _next_match; } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); - } - -_last_literals: - /* Encode Last Literals */ - { - int lastRun = (int)(iend - anchor); - if ((outputLimited) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) - return 0; /* Check output limit */ - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (BYTE)(lastRun<= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ - if (dict->initCheck) MEM_INIT(dict, 0, sizeof(LZ4_stream_t_internal)); /* Uninitialized structure detected */ - - if (dictSize < MINMATCH) - { - dict->dictionary = NULL; - dict->dictSize = 0; - return 1; - } - - if (p <= dictEnd - 64 KB) p = dictEnd - 64 KB; - base = p - dict->currentOffset; - dict->dictionary = p; - dict->dictSize = (U32)(dictEnd - p); - dict->currentOffset += dict->dictSize; - - while (p <= dictEnd-MINMATCH) - { - LZ4_putPosition(p, dict, byU32, base); - p+=3; - } - - return 1; -} - - -void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) -{ - if ((LZ4_dict->currentOffset > 0x80000000) || - ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ - { - /* rescale hash table */ - U32 
delta = LZ4_dict->currentOffset - 64 KB; - const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; - int i; - for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; - else LZ4_dict->hashTable[i] -= delta; - } - LZ4_dict->currentOffset = 64 KB; - if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; - LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; - } -} - - -FORCE_INLINE int LZ4_compress_continue_generic (void* LZ4_stream, const char* source, char* dest, int inputSize, - int maxOutputSize, limitedOutput_directive limit) -{ - LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; - const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; - - const BYTE* smallest = (const BYTE*) source; - if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ - if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; - LZ4_renormDictT(streamPtr, smallest); - - /* Check overlapping input/dictionary space */ - { - const BYTE* sourceEnd = (const BYTE*) source + inputSize; - if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) - { - streamPtr->dictSize = (U32)(dictEnd - sourceEnd); - if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; - if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; - streamPtr->dictionary = dictEnd - streamPtr->dictSize; - } - } - - /* prefix mode : source data follows dictionary */ - if (dictEnd == (const BYTE*)source) - { - int result; - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, dictSmall); - else - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, noDictIssue); - streamPtr->dictSize += (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - return result; - } - - /* external dictionary mode */ - { - int result; - if 
((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, dictSmall); - else - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, noDictIssue); - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - return result; - } -} - - -int LZ4_compress_continue (void* LZ4_stream, const char* source, char* dest, int inputSize) -{ - return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, 0, notLimited); -} - -int LZ4_compress_limitedOutput_continue (void* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize) -{ - return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput); -} - - -// Hidden debug function, to force separate dictionary mode -int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) -{ - LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; - int result; - const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; - - const BYTE* smallest = dictEnd; - if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; - LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); - - result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue); - - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - - return result; -} - - -int LZ4_saveDict (void* LZ4_dict, char* safeBuffer, int dictSize) -{ - LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; - const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; - - if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a 
dictionary > 64 KB */ - if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; - - memcpy(safeBuffer, previousDictEnd - dictSize, dictSize); - - dict->dictionary = (const BYTE*)safeBuffer; - dict->dictSize = (U32)dictSize; - - return 1; -} - - - -/**************************** - Decompression functions -****************************/ -/* - * This generic decompression function cover all use cases. - * It shall be instanciated several times, using different sets of directives - * Note that it is essential this generic function is really inlined, - * in order to remove useless branches during compilation optimisation. - */ -FORCE_INLINE int LZ4_decompress_generic( - const char* source, - char* dest, - int inputSize, - int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */ - - int endOnInput, /* endOnOutputSize, endOnInputSize */ - int partialDecoding, /* full, partial */ - int targetOutputSize, /* only used if partialDecoding==partial */ - int dict, /* noDict, withPrefix64k, usingExtDict */ - const char* dictStart, /* only if dict==usingExtDict */ - int dictSize /* note : = 0 if noDict */ - ) -{ - /* Local Variables */ - const BYTE* restrict ip = (const BYTE*) source; - const BYTE* ref; - const BYTE* const iend = ip + inputSize; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + outputSize; - BYTE* cpy; - BYTE* oexit = op + targetOutputSize; - const BYTE* const lowLimit = (const BYTE*)dest - dictSize; - - const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; -//#define OLD -#ifdef OLD - const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; /* static reduces speed for LZ4_decompress_safe() on GCC64 */ -#else - const size_t dec32table[] = {4-0, 4-3, 4-2, 4-3, 4-0, 4-0, 4-0, 4-0}; /* static reduces speed for LZ4_decompress_safe() on GCC64 */ -#endif - static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; - - const int checkOffset = (endOnInput) && (dictSize < (int)(64 KB)); - - - /* Special 
cases */ - if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ - if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ - if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); - - - /* Main Loop */ - while (1) - { - unsigned token; - size_t length; - - /* get runlength */ - token = *ip++; - if ((length=(token>>ML_BITS)) == RUN_MASK) - { - unsigned s; - do - { - s = *ip++; - length += s; - } - while (likely((endOnInput)?ipLZ4_MAX_INPUT_SIZE)) goto _output_error; /* overflow detection */ - if ((sizeof(void*)==4) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error; /* quickfix issue 134 */ - if ((endOnInput) && (sizeof(void*)==4) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error; /* quickfix issue 134 */ - } - - /* copy literals */ - cpy = op+length; - if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-COPYLENGTH))) - { - if (partialDecoding) - { - if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ - if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ - } - else - { - if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ - if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ - } - memcpy(op, ip, length); - ip += length; - op += length; - break; /* Necessarily EOF, due to parsing restrictions */ - } - LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; - - /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if ((checkOffset) && (unlikely(ref < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ - - /* get matchlength */ - if 
((length=(token&ML_MASK)) == ML_MASK) - { - unsigned s; - do - { - if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; - s = *ip++; - length += s; - } while (s==255); - //if ((sizeof(void*)==4) && unlikely(length>LZ4_MAX_INPUT_SIZE)) goto _output_error; /* overflow detection */ - if ((sizeof(void*)==4) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* quickfix issue 134 */ - } - - /* check external dictionary */ - if ((dict==usingExtDict) && (ref < (BYTE* const)dest)) - { - if (unlikely(op+length+MINMATCH > oend-LASTLITERALS)) goto _output_error; - - if (length+MINMATCH <= (size_t)(dest-(char*)ref)) - { - ref = dictEnd - (dest-(char*)ref); - memcpy(op, ref, length+MINMATCH); - op += length+MINMATCH; - } - else - { - size_t copySize = (size_t)(dest-(char*)ref); - memcpy(op, dictEnd - copySize, copySize); - op += copySize; - copySize = length+MINMATCH - copySize; - if (copySize > (size_t)((char*)op-dest)) /* overlap */ - { - BYTE* const cpy = op + copySize; - const BYTE* ref = (BYTE*)dest; - while (op < cpy) *op++ = *ref++; - } - else - { - memcpy(op, dest, copySize); - op += copySize; - } - } - continue; - } - - /* copy repeated sequence */ - if (unlikely((op-ref)<(int)STEPSIZE)) - { - const size_t dec64 = dec64table[(sizeof(void*)==4) ? 0 : op-ref]; - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; -#ifdef OLD - op += 4, ref += 4; ref -= dec32table[op-ref]; - A32(op) = A32(ref); - op += STEPSIZE-4; ref -= dec64; -#else - ref += dec32table[op-ref]; - A32(op+4) = A32(ref); - op += STEPSIZE; ref -= dec64; -#endif - } else { LZ4_COPYSTEP(op,ref); } - cpy = op + length - (STEPSIZE-4); - - if (unlikely(cpy>oend-COPYLENGTH-(STEPSIZE-4))) - { - if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last 5 bytes must be literals */ - if (opdictionary = dictionary; - lz4sd->dictSize = dictSize; - return 1; -} - -/* -*_continue() : - These decoding functions allow decompression of multiple blocks in "streaming" mode. 
- Previously decoded blocks must still be available at the memory position where they were decoded. - If it's not possible, save the relevant part of decoded data into a safe buffer, - and indicate where it stands using LZ4_setDictDecode() -*/ -int LZ4_decompress_safe_continue (void* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; - int result; - - result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize); - if (result <= 0) return result; - if (lz4sd->dictionary + lz4sd->dictSize == dest) - { - lz4sd->dictSize += result; - } - else - { - lz4sd->dictionary = dest; - lz4sd->dictSize = result; - } - - return result; -} - -int LZ4_decompress_fast_continue (void* LZ4_streamDecode, const char* source, char* dest, int originalSize) -{ - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; - int result; - - result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize); - if (result <= 0) return result; - if (lz4sd->dictionary + lz4sd->dictSize == dest) - { - lz4sd->dictSize += result; - } - else - { - lz4sd->dictionary = dest; - lz4sd->dictSize = result; - } - - return result; -} - - -/* -Advanced decoding functions : -*_usingDict() : - These decoding functions work the same as "_continue" ones, - the dictionary must be explicitly provided within parameters -*/ - -int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, dictStart, dictSize); -} - -int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* 
dictStart, int dictSize) -{ - return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, dictStart, dictSize); -} - - -/*************************************************** - Obsolete Functions -***************************************************/ -/* -These function names are deprecated and should no longer be used. -They are only provided here for compatibility with older user programs. -- LZ4_uncompress is totally equivalent to LZ4_decompress_fast -- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe -*/ -int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } -int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } - - -/* Obsolete Streaming functions */ - -int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } - -void LZ4_init(LZ4_stream_t_internal* lz4ds, const BYTE* base) -{ - MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); - lz4ds->bufferStart = base; -} - -int LZ4_resetStreamState(void* state, const char* inputBuffer) -{ - if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ - LZ4_init((LZ4_stream_t_internal*)state, (const BYTE*)inputBuffer); - return 0; -} - -void* LZ4_create (const char* inputBuffer) -{ - void* lz4ds = ALLOCATOR(4, LZ4_STREAMSIZE_U32); - LZ4_init ((LZ4_stream_t_internal*)lz4ds, (const BYTE*)inputBuffer); - return lz4ds; -} - -char* LZ4_slideInputBuffer (void* LZ4_Data) -{ - LZ4_stream_t_internal* lz4ds = (LZ4_stream_t_internal*)LZ4_Data; - - LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)lz4ds->bufferStart, 64 KB); - - return (char*)(lz4ds->bufferStart + 64 KB); -} - -/* Obsolete compresson functions using User-allocated state */ - -int LZ4_sizeofState() { return LZ4_STREAMSIZE; } - -int LZ4_compress_withState (void* state, const char* source, char* dest, int 
inputSize) -{ - if (((size_t)(state)&3) != 0) return 0; /* Error : state is not aligned on 4-bytes boundary */ - MEM_INIT(state, 0, LZ4_STREAMSIZE); - - if (inputSize < (int)LZ4_64KLIMIT) - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue); - else - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? byU32 : byPtr, noDict, noDictIssue); -} - -int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize) -{ - if (((size_t)(state)&3) != 0) return 0; /* Error : state is not aligned on 4-bytes boundary */ - MEM_INIT(state, 0, LZ4_STREAMSIZE); - - if (inputSize < (int)LZ4_64KLIMIT) - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue); - else - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, (sizeof(void*)==8) ? byU32 : byPtr, noDict, noDictIssue); -} - -/* Obsolete streaming decompression functions */ - -int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, NULL, 64 KB); -} - -int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) -{ - return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, NULL, 64 KB); -} diff --git a/c-blosc/internal-complibs/lz4-r119/lz4.h b/c-blosc/internal-complibs/lz4-r119/lz4.h deleted file mode 100644 index 1064fa115..000000000 --- a/c-blosc/internal-complibs/lz4-r119/lz4.h +++ /dev/null @@ -1,306 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Header File - Copyright (C) 2011-2014, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - You can contact the author at : - - LZ4 source repository : http://code.google.com/p/lz4/ - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ -#pragma once - -#if defined (__cplusplus) -extern "C" { -#endif - - -/************************************** - Version -**************************************/ -#define LZ4_VERSION_MAJOR 1 /* for major interface/format changes */ -#define LZ4_VERSION_MINOR 2 /* for minor interface/format changes */ -#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ - - -/************************************** - Tuning parameter -**************************************/ -/* - * LZ4_MEMORY_USAGE : - * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) - * Increasing memory usage improves compression ratio - * Reduced memory usage can improve speed, due to cache effect - * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache - */ -#define LZ4_MEMORY_USAGE 14 - - -/************************************** - Simple Functions -**************************************/ - -int LZ4_compress (const char* source, char* dest, int inputSize); -int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxOutputSize); - -/* -LZ4_compress() : - Compresses 'inputSize' bytes from 'source' into 'dest'. - Destination buffer must be already allocated, - and must be sized to handle worst cases situations (input data not compressible) - Worst case size evaluation is provided by function LZ4_compressBound() - inputSize : Max supported value is LZ4_MAX_INPUT_VALUE - return : the number of bytes written in buffer dest - or 0 if the compression fails - -LZ4_decompress_safe() : - compressedSize : is obviously the source size - maxOutputSize : is the size of the destination buffer, which must be already allocated. 
- return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) - If the destination buffer is not large enough, decoding will stop and output an error code (<0). - If the source stream is detected malformed, the function will stop decoding and return a negative result. - This function is protected against buffer overflow exploits : - it never writes outside of output buffer, and never reads outside of input buffer. - Therefore, it is protected against malicious data packets. -*/ - - -/* -Note : - Should you prefer to explicitly allocate compression-table memory using your own allocation method, - use the streaming functions provided below, simply reset the memory area between each call to LZ4_compress_continue() -*/ - - -/************************************** - Advanced Functions -**************************************/ -#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ -#define LZ4_COMPRESSBOUND(isize) ((unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) - -/* -LZ4_compressBound() : - Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) - primarily useful for memory allocation of output buffer. - macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). - - isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE - return : maximum output size in a "worst case" scenario - or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) -*/ -int LZ4_compressBound(int isize); - - -/* -LZ4_compress_limitedOutput() : - Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. - If it cannot achieve it, compression will stop, and result of the function will be zero. - This function never writes outside of provided output buffer. 
- - inputSize : Max supported value is LZ4_MAX_INPUT_VALUE - maxOutputSize : is the size of the destination buffer (which must be already allocated) - return : the number of bytes written in buffer 'dest' - or 0 if the compression fails -*/ -int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); - - -/* -LZ4_decompress_fast() : - originalSize : is the original and therefore uncompressed size - return : the number of bytes read from the source buffer (in other words, the compressed size) - If the source stream is malformed, the function will stop decoding and return a negative result. - Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. - note : This function is a bit faster than LZ4_decompress_safe() - It provides fast decompression and fully respect memory boundaries for properly formed compressed data. - It does not provide full protection against intentionnally modified data stream. - Use this function in a trusted environment (data to decode comes from a trusted source). -*/ -int LZ4_decompress_fast (const char* source, char* dest, int originalSize); - - -/* -LZ4_decompress_safe_partial() : - This function decompress a compressed block of size 'compressedSize' at position 'source' - into output buffer 'dest' of size 'maxOutputSize'. - The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, - reducing decompression time. - return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) - Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. - Always control how many bytes were decoded. - If the source stream is detected malformed, the function will stop decoding and return a negative result. - This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets -*/ -int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxOutputSize); - - -/*********************************************** - Experimental Streaming Compression Functions -***********************************************/ - -#define LZ4_STREAMSIZE_U32 ((1 << (LZ4_MEMORY_USAGE-2)) + 8) -#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U32 * sizeof(unsigned int)) -/* - * LZ4_stream_t - * information structure to track an LZ4 stream. - * important : set this structure content to zero before first use ! - */ -typedef struct { unsigned int table[LZ4_STREAMSIZE_U32]; } LZ4_stream_t; - -/* - * If you prefer dynamic allocation methods, - * LZ4_createStream - * provides a pointer (void*) towards an initialized LZ4_stream_t structure. - * LZ4_free just frees it. - */ -void* LZ4_createStream(); -int LZ4_free (void* LZ4_stream); - - -/* - * LZ4_loadDict - * Use this function to load a static dictionary into LZ4_stream. - * Any previous data will be forgotten, only 'dictionary' will remain in memory. - * Loading a size of 0 is allowed (same effect as init). - * Return : 1 if OK, 0 if error - */ -int LZ4_loadDict (void* LZ4_stream, const char* dictionary, int dictSize); - -/* - * LZ4_compress_continue - * Compress data block 'source', using blocks compressed before as dictionary to improve compression ratio - * Previous data blocks are assumed to still be present at their previous location. - */ -int LZ4_compress_continue (void* LZ4_stream, const char* source, char* dest, int inputSize); - -/* - * LZ4_compress_limitedOutput_continue - * Same as before, but also specify a maximum target compressed size (maxOutputSize) - * If objective cannot be met, compression exits, and returns a zero. 
- */ -int LZ4_compress_limitedOutput_continue (void* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize); - -/* - * LZ4_saveDict - * If previously compressed data block is not guaranteed to remain at its previous memory location - * save it into a safe place (char* safeBuffer) - * Note : you don't need to call LZ4_loadDict() afterwards, - * dictionary is immediately usable, you can therefore call again LZ4_compress_continue() - * Return : 1 if OK, 0 if error - * Note : any dictSize > 64 KB will be interpreted as 64KB. - */ -int LZ4_saveDict (void* LZ4_stream, char* safeBuffer, int dictSize); - - -/************************************************ - Experimental Streaming Decompression Functions -************************************************/ - -#define LZ4_STREAMDECODESIZE_U32 4 -#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U32 * sizeof(unsigned int)) -/* - * LZ4_streamDecode_t - * information structure to track an LZ4 stream. - * important : set this structure content to zero before first use ! - */ -typedef struct { unsigned int table[LZ4_STREAMDECODESIZE_U32]; } LZ4_streamDecode_t; - -/* - * If you prefer dynamic allocation methods, - * LZ4_createStreamDecode() - * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure. - * LZ4_free just frees it. - */ -void* LZ4_createStreamDecode(); -int LZ4_free (void* LZ4_stream); /* yes, it's the same one as for compression */ - -/* -*_continue() : - These decoding functions allow decompression of multiple blocks in "streaming" mode. - Previously decoded blocks must still be available at the memory position where they were decoded. 
- If it's not possible, save the relevant part of decoded data into a safe buffer, - and indicate where it stands using LZ4_setDictDecode() -*/ -int LZ4_decompress_safe_continue (void* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize); -int LZ4_decompress_fast_continue (void* LZ4_streamDecode, const char* source, char* dest, int originalSize); - -/* - * LZ4_setDictDecode - * Use this function to instruct where to find the dictionary. - * This function can be used to specify a static dictionary, - * or to instruct where to find some previously decoded data saved into a different memory space. - * Setting a size of 0 is allowed (same effect as no dictionary). - * Return : 1 if OK, 0 if error - */ -int LZ4_setDictDecode (void* LZ4_streamDecode, const char* dictionary, int dictSize); - - -/* -Advanced decoding functions : -*_usingDict() : - These decoding functions work the same as - a combination of LZ4_setDictDecode() followed by LZ4_decompress_x_continue() - all together into a single function call. - It doesn't use nor update an LZ4_streamDecode_t structure. -*/ -int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize); -int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); - - - - -/************************************** - Obsolete Functions -**************************************/ -/* -Obsolete decompression functions -These function names are deprecated and should no longer be used. -They are only provided here for compatibility with older user programs. 
-- LZ4_uncompress is the same as LZ4_decompress_fast -- LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe -*/ -int LZ4_uncompress (const char* source, char* dest, int outputSize); -int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); - -/* Obsolete functions for externally allocated state; use streaming interface instead */ -int LZ4_sizeofState(void); -int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); - -/* Obsolete streaming functions; use new streaming interface whenever possible */ -void* LZ4_create (const char* inputBuffer); -int LZ4_sizeofStreamState(void); -int LZ4_resetStreamState(void* state, const char* inputBuffer); -char* LZ4_slideInputBuffer (void* state); - -/* Obsolete streaming decoding functions */ -int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int compressedSize, int maxOutputSize); -int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int originalSize); - - -#if defined (__cplusplus) -} -#endif diff --git a/c-blosc/internal-complibs/lz4-r119/lz4hc.c b/c-blosc/internal-complibs/lz4-r119/lz4hc.c deleted file mode 100644 index 608674902..000000000 --- a/c-blosc/internal-complibs/lz4-r119/lz4hc.c +++ /dev/null @@ -1,892 +0,0 @@ -/* - LZ4 HC - High Compression Mode of LZ4 - Copyright (C) 2011-2014, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - - LZ4 source repository : http://code.google.com/p/lz4/ -*/ - - - -/************************************** - Tuning Parameter -**************************************/ -#define LZ4HC_DEFAULT_COMPRESSIONLEVEL 8 - - -/************************************** - Memory routines -**************************************/ -#include /* calloc, free */ -#define ALLOCATOR(s) calloc(1,s) -#define FREEMEM free -#include /* memset, memcpy */ -#define MEM_INIT memset - - -/************************************** - CPU Feature Detection -**************************************/ -/* 32 or 64 bits ? 
*/ -#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ - || defined(__powerpc64__) || defined(__powerpc64le__) \ - || defined(__ppc64__) || defined(__ppc64le__) \ - || defined(__PPC64__) || defined(__PPC64LE__) \ - || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) /* Detects 64 bits mode */ -# define LZ4_ARCH64 1 -#else -# define LZ4_ARCH64 0 -#endif - -/* - * Little Endian or Big Endian ? - * Overwrite the #define below if you know your architecture endianess - */ -#include /* Apparently required to detect endianess */ -#if defined (__GLIBC__) -# include -# if (__BYTE_ORDER == __BIG_ENDIAN) -# define LZ4_BIG_ENDIAN 1 -# endif -#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) -# define LZ4_BIG_ENDIAN 1 -#elif defined(__sparc) || defined(__sparc__) \ - || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ - || defined(__hpux) || defined(__hppa) \ - || defined(_MIPSEB) || defined(__s390__) -# define LZ4_BIG_ENDIAN 1 -#else -/* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */ -#endif - -/* - * Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
- * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected - * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance - */ -#if defined(__ARM_FEATURE_UNALIGNED) -# define LZ4_FORCE_UNALIGNED_ACCESS 1 -#endif - -/* Define this parameter if your target system or compiler does not support hardware bit count */ -#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ -# define LZ4_FORCE_SW_BITCOUNT -#endif - - -/************************************** - Compiler Options -**************************************/ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -/* "restrict" is a known keyword */ -#else -# define restrict /* Disable restrict */ -#endif - -#ifdef _MSC_VER /* Visual Studio */ -# define FORCE_INLINE static __forceinline -# include /* For Visual 2005 */ -# if LZ4_ARCH64 /* 64-bits */ -# pragma intrinsic(_BitScanForward64) /* For Visual 2005 */ -# pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */ -# else /* 32-bits */ -# pragma intrinsic(_BitScanForward) /* For Visual 2005 */ -# pragma intrinsic(_BitScanReverse) /* For Visual 2005 */ -# endif -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable used */ -#else -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -#endif - -#ifdef _MSC_VER /* Visual Studio */ -# define lz4_bswap16(x) _byteswap_ushort(x) -#else -# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) -#endif - - -/************************************** - Includes -**************************************/ -#include "lz4hc.h" -#include "lz4.h" - - 
-/************************************** - Basic Types -**************************************/ -#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -#else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; -#endif - -#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) -# define _PACKED __attribute__ ((packed)) -#else -# define _PACKED -#endif - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# ifdef __IBMC__ -# pragma pack(1) -# else -# pragma pack(push, 1) -# endif -#endif - -typedef struct _U16_S { U16 v; } _PACKED U16_S; -typedef struct _U32_S { U32 v; } _PACKED U32_S; -typedef struct _U64_S { U64 v; } _PACKED U64_S; - -#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# pragma pack(pop) -#endif - -#define A64(x) (((U64_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A16(x) (((U16_S *)(x))->v) - - -/************************************** - Constants -**************************************/ -#define MINMATCH 4 - -#define DICTIONARY_LOGSIZE 16 -#define MAXD (1<> ((MINMATCH*8)-HASH_LOG)) -#define HASH_VALUE(p) HASH_FUNCTION(A32(p)) -#define HASH_POINTER(p) (HashTable[HASH_VALUE(p)] + base) -#define DELTANEXT(p) chainTable[(size_t)(p) & MAXD_MASK] -#define GETNEXT(p) ((p) - (size_t)DELTANEXT(p)) - - -/************************************** - Private functions -**************************************/ -#if LZ4_ARCH64 - -FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) -{ -#if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) 
>> 3); -# else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif -#else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; -# endif -#endif -} - -#else - -FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) -{ -#if defined(LZ4_BIG_ENDIAN) -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanReverse( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -# else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif -#else -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward( &r, val ); - return (int)(r>>3); -# elif defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif -#endif -} - -#endif - - -int LZ4_sizeofStreamStateHC() -{ - return sizeof(LZ4HC_Data_Structure); -} - -FORCE_INLINE void LZ4_initHC (LZ4HC_Data_Structure* hc4, const BYTE* base) -{ - 
MEM_INIT((void*)hc4->hashTable, 0, sizeof(hc4->hashTable)); - MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); - hc4->nextToUpdate = base + 1; - hc4->base = base; - hc4->inputBuffer = base; - hc4->end = base; -} - -int LZ4_resetStreamStateHC(void* state, const char* inputBuffer) -{ - if ((((size_t)state) & (sizeof(void*)-1)) != 0) return 1; /* Error : pointer is not aligned for pointer (32 or 64 bits) */ - LZ4_initHC((LZ4HC_Data_Structure*)state, (const BYTE*)inputBuffer); - return 0; -} - - -void* LZ4_createHC (const char* inputBuffer) -{ - void* hc4 = ALLOCATOR(sizeof(LZ4HC_Data_Structure)); - LZ4_initHC ((LZ4HC_Data_Structure*)hc4, (const BYTE*)inputBuffer); - return hc4; -} - - -int LZ4_freeHC (void* LZ4HC_Data) -{ - FREEMEM(LZ4HC_Data); - return (0); -} - - -/* Update chains up to ip (excluded) */ -FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip) -{ - U16* chainTable = hc4->chainTable; - HTYPE* HashTable = hc4->hashTable; - INITBASE(base,hc4->base); - - while(hc4->nextToUpdate < ip) - { - const BYTE* const p = hc4->nextToUpdate; - size_t delta = (p) - HASH_POINTER(p); - if (delta>MAX_DISTANCE) delta = MAX_DISTANCE; - DELTANEXT(p) = (U16)delta; - HashTable[HASH_VALUE(p)] = (HTYPE)((p) - base); - hc4->nextToUpdate++; - } -} - - -char* LZ4_slideInputBufferHC(void* LZ4HC_Data) -{ - LZ4HC_Data_Structure* hc4 = (LZ4HC_Data_Structure*)LZ4HC_Data; - U32 distance = (U32)(hc4->end - hc4->inputBuffer) - 64 KB; - distance = (distance >> 16) << 16; /* Must be a multiple of 64 KB */ - LZ4HC_Insert(hc4, hc4->end - MINMATCH); - memcpy((void*)(hc4->end - 64 KB - distance), (const void*)(hc4->end - 64 KB), 64 KB); - hc4->nextToUpdate -= distance; - hc4->base -= distance; - if ((U32)(hc4->inputBuffer - hc4->base) > 1 GB + 64 KB) /* Avoid overflow */ - { - int i; - hc4->base += 1 GB; - for (i=0; ihashTable[i] -= 1 GB; - } - hc4->end -= distance; - return (char*)(hc4->end); -} - - -FORCE_INLINE size_t LZ4HC_CommonLength (const BYTE* p1, const 
BYTE* p2, const BYTE* const matchlimit) -{ - const BYTE* p1t = p1; - - while (p1tchainTable; - HTYPE* const HashTable = hc4->hashTable; - const BYTE* ref; - INITBASE(base,hc4->base); - int nbAttempts=maxNbAttempts; - size_t repl=0, ml=0; - U16 delta=0; /* useless assignment, to remove an uninitialization warning */ - - /* HC4 match finder */ - LZ4HC_Insert(hc4, ip); - ref = HASH_POINTER(ip); - -#define REPEAT_OPTIMIZATION -#ifdef REPEAT_OPTIMIZATION - /* Detect repetitive sequences of length <= 4 */ - if ((U32)(ip-ref) <= 4) /* potential repetition */ - { - if (A32(ref) == A32(ip)) /* confirmed */ - { - delta = (U16)(ip-ref); - repl = ml = LZ4HC_CommonLength(ip+MINMATCH, ref+MINMATCH, matchlimit) + MINMATCH; - *matchpos = ref; - } - ref = GETNEXT(ref); - } -#endif - - while (((U32)(ip-ref) <= MAX_DISTANCE) && (nbAttempts)) - { - nbAttempts--; - if (*(ref+ml) == *(ip+ml)) - if (A32(ref) == A32(ip)) - { - size_t mlt = LZ4HC_CommonLength(ip+MINMATCH, ref+MINMATCH, matchlimit) + MINMATCH; - if (mlt > ml) { ml = mlt; *matchpos = ref; } - } - ref = GETNEXT(ref); - } - -#ifdef REPEAT_OPTIMIZATION - /* Complete table */ - if (repl) - { - const BYTE* ptr = ip; - const BYTE* end; - - end = ip + repl - (MINMATCH-1); - while(ptr < end-delta) - { - DELTANEXT(ptr) = delta; /* Pre-Load */ - ptr++; - } - do - { - DELTANEXT(ptr) = delta; - HashTable[HASH_VALUE(ptr)] = (HTYPE)((ptr) - base); /* Head of chain */ - ptr++; - } while(ptr < end); - hc4->nextToUpdate = end; - } -#endif - - return (int)ml; -} - - -FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch (LZ4HC_Data_Structure* hc4, const BYTE* ip, const BYTE* startLimit, const BYTE* matchlimit, int longest, const BYTE** matchpos, const BYTE** startpos, const int maxNbAttempts) -{ - U16* const chainTable = hc4->chainTable; - HTYPE* const HashTable = hc4->hashTable; - INITBASE(base,hc4->base); - const BYTE* ref; - int nbAttempts = maxNbAttempts; - int delta = (int)(ip-startLimit); - - /* First Match */ - LZ4HC_Insert(hc4, ip); - ref = 
HASH_POINTER(ip); - - while (((U32)(ip-ref) <= MAX_DISTANCE) && (nbAttempts)) - { - nbAttempts--; - if (*(startLimit + longest) == *(ref - delta + longest)) - if (A32(ref) == A32(ip)) - { -#if 1 - const BYTE* reft = ref+MINMATCH; - const BYTE* ipt = ip+MINMATCH; - const BYTE* startt = ip; - - while (iptstartLimit) && (reft > hc4->inputBuffer) && (startt[-1] == reft[-1])) {startt--; reft--;} - - if ((ipt-startt) > longest) - { - longest = (int)(ipt-startt); - *matchpos = reft; - *startpos = startt; - } - } - ref = GETNEXT(ref); - } - - return longest; -} - - -typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; - -FORCE_INLINE int LZ4HC_encodeSequence ( - const BYTE** ip, - BYTE** op, - const BYTE** anchor, - int matchLength, - const BYTE* ref, - limitedOutput_directive limitedOutputBuffer, - BYTE* oend) -{ - int length; - BYTE* token; - - /* Encode Literal length */ - length = (int)(*ip - *anchor); - token = (*op)++; - if ((limitedOutputBuffer) && ((*op + length + (2 + 1 + LASTLITERALS) + (length>>8)) > oend)) return 1; /* Check output limit */ - if (length>=(int)RUN_MASK) { int len; *token=(RUN_MASK< 254 ; len-=255) *(*op)++ = 255; *(*op)++ = (BYTE)len; } - else *token = (BYTE)(length<>8) > oend)) return 1; /* Check output limit */ - if (length>=(int)ML_MASK) { *token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } if (length > 254) { length-=255; *(*op)++ = 255; } *(*op)++ = (BYTE)length; } - else *token += (BYTE)(length); - - /* Prepare next loop */ - *ip += matchLength; - *anchor = *ip; - - return 0; -} - - -#define MAX_COMPRESSION_LEVEL 16 -static int LZ4HC_compress_generic ( - void* ctxvoid, - const char* source, - char* dest, - int inputSize, - int maxOutputSize, - int compressionLevel, - limitedOutput_directive limit - ) -{ - LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid; - const BYTE* ip = (const BYTE*) source; - const BYTE* anchor = ip; - const BYTE* const iend = ip + 
inputSize; - const BYTE* const mflimit = iend - MFLIMIT; - const BYTE* const matchlimit = (iend - LASTLITERALS); - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + maxOutputSize; - - const int maxNbAttempts = compressionLevel > MAX_COMPRESSION_LEVEL ? 1 << MAX_COMPRESSION_LEVEL : compressionLevel ? 1<<(compressionLevel-1) : 1<end) return 0; - ctx->end += inputSize; - - ip++; - - /* Main Loop */ - while (ip < mflimit) - { - ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref), maxNbAttempts); - if (!ml) { ip++; continue; } - - /* saved, in case we would skip too much */ - start0 = ip; - ref0 = ref; - ml0 = ml; - -_Search2: - if (ip+ml < mflimit) - ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2, maxNbAttempts); - else ml2 = ml; - - if (ml2 == ml) /* No better match */ - { - if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; - continue; - } - - if (start0 < ip) - { - if (start2 < ip + ml0) /* empirical */ - { - ip = start0; - ref = ref0; - ml = ml0; - } - } - - /* Here, start0==ip */ - if ((start2 - ip) < 3) /* First Match too small : removed */ - { - ml = ml2; - ip = start2; - ref =ref2; - goto _Search2; - } - -_Search3: - /* - * Currently we have : - * ml2 > ml1, and - * ip1+3 <= ip2 (usually < ip1+ml1) - */ - if ((start2 - ip) < OPTIMAL_ML) - { - int correction; - int new_ml = ml; - if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; - if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; - correction = new_ml - (int)(start2 - ip); - if (correction > 0) - { - start2 += correction; - ref2 += correction; - ml2 -= correction; - } - } - /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */ - - if (start2 + ml2 < mflimit) - ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, maxNbAttempts); - else ml3 = ml2; - - if (ml3 == ml2) /* No better match : 2 sequences to encode */ - { - /* ip & ref 
are known; Now for ml */ - if (start2 < ip+ml) ml = (int)(start2 - ip); - /* Now, encode 2 sequences */ - if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; - ip = start2; - if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml2, ref2, limit, oend)) return 0; - continue; - } - - if (start3 < ip+ml+3) /* Not enough space for match 2 : remove it */ - { - if (start3 >= (ip+ml)) /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ - { - if (start2 < ip+ml) - { - int correction = (int)(ip+ml - start2); - start2 += correction; - ref2 += correction; - ml2 -= correction; - if (ml2 < MINMATCH) - { - start2 = start3; - ref2 = ref3; - ml2 = ml3; - } - } - - if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; - ip = start3; - ref = ref3; - ml = ml3; - - start0 = start2; - ref0 = ref2; - ml0 = ml2; - goto _Search2; - } - - start2 = start3; - ref2 = ref3; - ml2 = ml3; - goto _Search3; - } - - /* - * OK, now we have 3 ascending matches; let's write at least the first one - * ip & ref are known; Now for ml - */ - if (start2 < ip+ml) - { - if ((start2 - ip) < (int)ML_MASK) - { - int correction; - if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; - if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH; - correction = ml - (int)(start2 - ip); - if (correction > 0) - { - start2 += correction; - ref2 += correction; - ml2 -= correction; - } - } - else - { - ml = (int)(start2 - ip); - } - } - if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; - - ip = start2; - ref = ref2; - ml = ml2; - - start2 = start3; - ref2 = ref3; - ml2 = ml3; - - goto _Search3; - - } - - /* Encode Last Literals */ - { - int lastRun = (int)(iend - anchor); - if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */ - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else 
*op++ = (BYTE)(lastRun< The memory position where the next input data block must start is provided as the result of the function. - -Compression can then resume, using LZ4_compressHC_continue() or LZ4_compressHC_limitedOutput_continue(), as usual. - -When compression is completed, a call to LZ4_freeHC() will release the memory used by the LZ4HC Data Structure. -*/ - -int LZ4_sizeofStreamStateHC(void); -int LZ4_resetStreamStateHC(void* state, const char* inputBuffer); - -/* -These functions achieve the same result as : -void* LZ4_createHC (const char* inputBuffer); - -They are provided here to allow the user program to allocate memory using its own routines. - -To know how much space must be allocated, use LZ4_sizeofStreamStateHC(); -Note also that space must be aligned for pointers (32 or 64 bits). - -Once space is allocated, you must initialize it using : LZ4_resetStreamStateHC(void* state, const char* inputBuffer); -void* state is a pointer to the space allocated. -It must be aligned for pointers (32 or 64 bits), and be large enough. -The parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. -The input buffer must be already allocated, and size at least 192KB. -'inputBuffer' will also be the 'const char* source' of the first block. - -The same space can be re-used multiple times, just by initializing it each time with LZ4_resetStreamState(). -return value of LZ4_resetStreamStateHC() must be 0 is OK. -Any other value means there was an error (typically, state is not aligned for pointers (32 or 64 bits)). 
-*/ - - -#if defined (__cplusplus) -} -#endif diff --git a/c-blosc/scripts/travis-before-install.sh b/c-blosc/scripts/travis-before-install.sh new file mode 100755 index 000000000..8dacf772f --- /dev/null +++ b/c-blosc/scripts/travis-before-install.sh @@ -0,0 +1,16 @@ +#/bin/sh -f + +# things to do for travis-ci in the before_install section + +if ( test "`uname -s`" = "Darwin" ) +then + #cmake v2.8.12 is installed on the Mac workers now + #brew update + #brew install cmake + echo +else + #install a newer cmake since at this time Travis only has version 2.8.7 + sudo add-apt-repository --yes ppa:kalakris/cmake + sudo apt-get update -qq + sudo apt-get install cmake +fi diff --git a/c-blosc/tests/CMakeLists.txt b/c-blosc/tests/CMakeLists.txt index a76d670b1..258d9fc26 100644 --- a/c-blosc/tests/CMakeLists.txt +++ b/c-blosc/tests/CMakeLists.txt @@ -7,8 +7,90 @@ link_directories(${PROJECT_BINARY_DIR}/blosc) # targets and tests foreach(source ${SOURCES}) + # Enable support for testing accelerated shuffles + if(COMPILER_SUPPORT_SSE2) + # Define a symbol so tests for SSE2 shuffle/unshuffle will be compiled in. + set_property( + SOURCE ${source} + APPEND PROPERTY COMPILE_DEFINITIONS SHUFFLE_SSE2_ENABLED) + endif(COMPILER_SUPPORT_SSE2) +# if(COMPILER_SUPPORT_AVX2) +# # Define a symbol so tests for AVX2 shuffle/unshuffle will be compiled in. +# set_property( +# SOURCE ${source} +# APPEND PROPERTY COMPILE_DEFINITIONS SHUFFLE_AVX2_ENABLED) +# endif(COMPILER_SUPPORT_AVX2) + get_filename_component(target ${source} NAME_WE) add_executable(${target} ${source}) - target_link_libraries(${target} blosc_shared) - add_test(test_${target} ${target}) + + # Define the BLOSC_TESTING symbol so normally-hidden functions + # aren't hidden from the view of the test programs. 
+ set_property( + TARGET ${target} + APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_TESTING) + + # have to copy dlls for Visual Studio + if(MSVC) + add_custom_command( + TARGET ${target} + POST_BUILD + COMMAND ${CMAKE_COMMAND} + ARGS -E copy_if_different + "${PROJECT_BINARY_DIR}/blosc/\$\(Configuration\)/blosc_testing.dll" + "${CMAKE_CURRENT_BINARY_DIR}/\$\(Configuration\)/blosc_testing.dll") + elseif(MINGW) + add_custom_command( + TARGET ${target} + POST_BUILD + COMMAND ${CMAKE_COMMAND} + ARGS -E copy_if_different + "${PROJECT_BINARY_DIR}/blosc/libblosc_testing.dll" + "${CMAKE_CURRENT_BINARY_DIR}/libblosc_testing.dll") + endif() + + target_link_libraries(${target} blosc_testing) + + # If there's a CSV file present for this test, read it to get the list + # of test parameters then add a test for each parameter set. + # Otherwise, this is a simple test so just add it once. + get_filename_component(source_extension ${source} EXT) + string(REGEX REPLACE "${source_extension}$" ".csv" + test_params_file ${source}) + if (EXISTS "${test_params_file}") + # Read the file contents into a CMake list + file(READ "${test_params_file}" test_params_contents) + + string(REGEX REPLACE ";" "\\\\;" + test_params_contents "${test_params_contents}") + string(REGEX REPLACE "\n" ";" + test_params_contents "${test_params_contents}") + + # How many parameter sets for this test? + # If there's not at least one (accounting for the CSV header line), + # that's probably not correct so emit an error and stop configuring. + list(LENGTH test_params_contents test_params_count) + if ("${test_params_count}" LESS 2) + message(ERROR "Invalid test parameters file: ${test_params_file}") + endif() + + # Remove the header line. + list(REMOVE_AT test_params_contents 0) + + # Add a test for each parameter set in the file. + foreach(test_params_raw ${test_params_contents}) + string(REGEX REPLACE "," " " test_params "${test_params_raw}") + + # Create the test name. 
+ # NOTE: The documentation for add_test says the test name "may not contain + # spaces, quotes, or other characters special in CMake syntax." + string(REGEX REPLACE "\"| " "_" test_name_params "${test_params}") + set(test_name "${target}_${test_name_params}") + + separate_arguments(test_params) + add_test(${test_name} ${target} ${test_params}) + endforeach() + else() + add_test(${target} ${target}) + endif() endforeach(source) diff --git a/c-blosc/tests/Makefile b/c-blosc/tests/Makefile index 5f9e3e475..8a68067ea 100644 --- a/c-blosc/tests/Makefile +++ b/c-blosc/tests/Makefile @@ -9,7 +9,7 @@ SOURCES := $(wildcard *.c) EXECUTABLES := $(patsubst %.c, %.exe, $(SOURCES)) # Support for internal LZ4 and LZ4HC -LZ4_DIR = ../internal-complibs/lz4-r119 +LZ4_DIR = ../internal-complibs/lz4-1.7.2 CFLAGS += -DHAVE_LZ4 -I$(LZ4_DIR) BLOSC_LIB += $(wildcard $(LZ4_DIR)/*.c) diff --git a/c-blosc/tests/gcc-segfault-issue.c b/c-blosc/tests/gcc-segfault-issue.c new file mode 100644 index 000000000..76c617bd0 --- /dev/null +++ b/c-blosc/tests/gcc-segfault-issue.c @@ -0,0 +1,80 @@ +/* + Copyright (C) 2016 Francesc Alted + http://blosc.org + License: MIT (see LICENSE.txt) + + Test program trying to replicate the python-blosc issue: + + https://github.com/Blosc/python-blosc/issues/110 + + Apparently this only affects to blosc-powered Python extensions. + + To compile this program: + + $ gcc -O3 gcc-segfault-issue.c -o gcc-segfault-issue -lblosc + + To run: + + $ ./gcc-segfault-issue + Blosc version info: 1.8.1.dev ($Date:: 2016-03-31 #$) + Compression: 8000000 -> 73262 (109.2x) + + To check that everything goes well: + + $ time for i in {1..1000}; do ./gcc-segfault-issue > p ; done + + real 0m4.590s + user 0m2.516s + sys 0m1.884s + + If you don't see any "Segmentation fault (core dumped)", the + C-Blosc library itself is probably not a victim of the infamous + issue above that only seems to affect Python extensions. 
+ +*/ + +#include +#include + +#define SIZE 1000*1000 + +int main(){ + static double data[SIZE]; + static double data_out[SIZE]; + static double data_dest[SIZE]; + int isize = SIZE*sizeof(double), osize = SIZE*sizeof(double); + int dsize = SIZE*sizeof(double), csize; + int i; + + for(i=0; i %d (%.1fx)\n", isize, csize, (1.*isize) / csize); + + /* Destroy the global Blosc context */ + blosc_destroy(); + + return 0; +} diff --git a/c-blosc/tests/test_all.sh b/c-blosc/tests/test_all.sh index b18cb56c8..2d772f7a8 100644 --- a/c-blosc/tests/test_all.sh +++ b/c-blosc/tests/test_all.sh @@ -1,10 +1,10 @@ #********************************************************************* -# Blosc - Blocked Suffling and Compression Library +# Blosc - Blocked Shuffling and Compression Library # # Unit tests for basic features in Blosc. # # Creation date: 2010-06-07 -# Author: Francesc Alted +# Author: Francesc Alted # # See LICENSES/BLOSC.txt for details about copyright and rights to use. #********************************************************************** diff --git a/c-blosc/tests/test_api.c b/c-blosc/tests/test_api.c index 201b2c5e3..455ec10a0 100644 --- a/c-blosc/tests/test_api.c +++ b/c-blosc/tests/test_api.c @@ -1,10 +1,10 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library Unit tests for Blosc API. Creation date: 2010-06-07 - Author: Francesc Alted + Author: Francesc Alted See LICENSES/BLOSC.txt for details about copyright and rights to use. 
**********************************************************************/ @@ -73,6 +73,7 @@ static char *all_tests() { return 0; } +#define BUFFER_ALIGN_SIZE 8 int main(int argc, char **argv) { char *result; @@ -83,10 +84,10 @@ int main(int argc, char **argv) { blosc_set_nthreads(1); /* Initialize buffers */ - src = malloc(size); - srccpy = malloc(size); - dest = malloc(size); - dest2 = malloc(size); + src = blosc_test_malloc(BUFFER_ALIGN_SIZE, size); + srccpy = blosc_test_malloc(BUFFER_ALIGN_SIZE, size); + dest = blosc_test_malloc(BUFFER_ALIGN_SIZE, size); + dest2 = blosc_test_malloc(BUFFER_ALIGN_SIZE, size); memset(src, 0, size); memcpy(srccpy, src, size); @@ -106,7 +107,11 @@ int main(int argc, char **argv) { } printf("\tTests run: %d\n", tests_run); - free(src); free(srccpy); free(dest); free(dest2); + blosc_test_free(src); + blosc_test_free(srccpy); + blosc_test_free(dest); + blosc_test_free(dest2); + blosc_destroy(); return result != 0; diff --git a/c-blosc/tests/test_basics.c b/c-blosc/tests/test_basics.c deleted file mode 100644 index af5ad65c6..000000000 --- a/c-blosc/tests/test_basics.c +++ /dev/null @@ -1,207 +0,0 @@ -/********************************************************************* - Blosc - Blocked Suffling and Compression Library - - Unit tests for basic features in Blosc. - - Creation date: 2010-06-07 - Author: Francesc Alted - - See LICENSES/BLOSC.txt for details about copyright and rights to use. 
-**********************************************************************/ - -#include "test_common.h" - -int tests_run = 0; - -/* Global vars */ -void *src, *srccpy, *dest, *dest2; -size_t nbytes, cbytes; -int clevel = 1; -int doshuffle = 0; -size_t typesize = 4; -size_t size = 1000; /* must be divisible by 4 */ - - -/* Check maxout with maxout < size */ -static char *test_maxout_less() { - - /* Get a compressed buffer */ - cbytes = blosc_compress(clevel, doshuffle, typesize, size, src, - dest, size+15); - mu_assert("ERROR: cbytes is not 0", cbytes == 0); - - return 0; -} - -/* Check maxout with maxout == size */ -static char *test_maxout_equal() { - - /* Get a compressed buffer */ - cbytes = blosc_compress(clevel, doshuffle, typesize, size, src, - dest, size+16); - mu_assert("ERROR: cbytes is not correct", cbytes == size+16); - - /* Decompress the buffer */ - nbytes = blosc_decompress(dest, dest2, size); - mu_assert("ERROR: nbytes incorrect(1)", nbytes == size); - - return 0; -} - - -/* Check maxout with maxout > size */ -static char *test_maxout_great() { - /* Get a compressed buffer */ - cbytes = blosc_compress(clevel, doshuffle, typesize, size, src, - dest, size+17); - mu_assert("ERROR: cbytes is not 0", cbytes == size+16); - - /* Decompress the buffer */ - nbytes = blosc_decompress(dest, dest2, size); - mu_assert("ERROR: nbytes incorrect(1)", nbytes == size); - - return 0; -} - -static char * test_shuffle() -{ - int sizes[] = {7, 64 * 3, 7*256, 500, 8000, 100000, 702713}; - int types[] = {1, 2, 3, 4, 5, 6, 7, 8, 16}; - int i, j, k; - int ok; - for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { - for (j = 0; j < sizeof(types) / sizeof(types[0]); j++) { - int n = sizes[i]; - int t = types[j]; - char * d = malloc(t * n); - char * d2 = malloc(t * n); - char * o = malloc(t * n + BLOSC_MAX_OVERHEAD); - for (k = 0; k < n; k++) { - d[k] = rand(); - } - blosc_compress(5, 1, t, t * n, d, o, t * n + BLOSC_MAX_OVERHEAD); - blosc_decompress(o, d2, t * n); - ok = 1; - 
for (k = 0; ok && k < n; k++) { - ok = (d[k] == d2[k]); - } - free(d); - free(d2); - free(o); - mu_assert("ERROR: shuffle test failed", ok); - } - } - - return 0; -} - -static char * test_noshuffle() -{ - int sizes[] = {7, 64 * 3, 7*256, 500, 8000, 100000, 702713}; - int types[] = {1, 2, 3, 4, 5, 6, 7, 8, 16}; - int i, j, k; - int ok; - for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { - for (j = 0; j < sizeof(types) / sizeof(types[0]); j++) { - int n = sizes[i]; - int t = types[j]; - char * d = malloc(t * n); - char * d2 = malloc(t * n); - char * o = malloc(t * n + BLOSC_MAX_OVERHEAD); - for (k = 0; k < n; k++) { - d[k] = rand(); - } - blosc_compress(5, 0, t, t * n, d, o, t * n + BLOSC_MAX_OVERHEAD); - blosc_decompress(o, d2, t * n); - ok = 1; - for (k = 0; ok && k < n; k++) { - ok = (d[k] == d2[k]); - } - free(d); - free(d2); - free(o); - mu_assert("ERROR: noshuffle test failed", ok); - } - } - - return 0; -} - -static char * test_getitem() -{ - int sizes[] = {7, 64 * 3, 7*256, 500, 8000, 100000, 702713}; - int types[] = {1, 2, 3, 4, 5, 6, 7, 8, 16}; - int i, j, k; - int ok; - for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { - for (j = 0; j < sizeof(types) / sizeof(types[0]); j++) { - int n = sizes[i]; - int t = types[j]; - char * d = malloc(t * n); - char * d2 = malloc(t * n); - char * o = malloc(t * n + BLOSC_MAX_OVERHEAD); - for (k = 0; k < n; k++) { - d[k] = rand(); - } - blosc_compress(5, 1, t, t * n, d, o, t * n + BLOSC_MAX_OVERHEAD); - blosc_getitem(o, 0, n, d2); - ok = 1; - for (k = 0; ok && k < n; k++) { - ok = (d[k] == d2[k]); - } - free(d); - free(d2); - free(o); - mu_assert("ERROR: getitem test failed", ok); - } - } - - return 0; -} - -static char *all_tests() { - mu_run_test(test_maxout_less); - mu_run_test(test_maxout_equal); - mu_run_test(test_maxout_great); - mu_run_test(test_shuffle); - mu_run_test(test_noshuffle); - mu_run_test(test_getitem); - return 0; -} - -int main(int argc, char **argv) { - size_t i; - int32_t *_src; - char 
*result; - - printf("STARTING TESTS for %s", argv[0]); - - blosc_init(); - blosc_set_nthreads(1); - - /* Initialize buffers */ - src = malloc(size); - srccpy = malloc(size); - dest = malloc(size+16); - dest2 = malloc(size); - _src = (int32_t *)src; - for (i=0; i < (size/4); i++) { - _src[i] = i; - } - memcpy(srccpy, src, size); - - /* Run all the suite */ - result = all_tests(); - if (result != 0) { - printf(" (%s)\n", result); - } - else { - printf(" ALL TESTS PASSED"); - } - printf("\tTests run: %d\n", tests_run); - - free(src); free(srccpy); free(dest); free(dest2); - blosc_destroy(); - - return result != 0; -} diff --git a/c-blosc/tests/test_common.h b/c-blosc/tests/test_common.h index c4c8c3eb0..c97c0b5d0 100644 --- a/c-blosc/tests/test_common.h +++ b/c-blosc/tests/test_common.h @@ -1,14 +1,17 @@ /********************************************************************* - Blosc - Blocked Suffling and Compression Library + Blosc - Blocked Shuffling and Compression Library Unit tests for basic features in Blosc. Creation date: 2010-06-07 - Author: Francesc Alted + Author: Francesc Alted See LICENSES/BLOSC.txt for details about copyright and rights to use. **********************************************************************/ +#ifndef BLOSC_TEST_COMMON_H +#define BLOSC_TEST_COMMON_H + #include #include #include @@ -19,6 +22,7 @@ #include #include "win32/stdint-windows.h" #else + #include #include #include #endif @@ -38,3 +42,106 @@ extern int tests_run; #define KB 1024 #define MB (1024*KB) #define GB (1024*MB) + +/* + Memory functions. +*/ + +/** Allocates a block of memory with the specified size and alignment. + The allocated memory is 'cleaned' before returning to avoid + accidental re-use of data within or between tests. + */ +static void* blosc_test_malloc(const size_t alignment, const size_t size) +{ + const int32_t clean_value = 0x99; + void *block = NULL; + int32_t res = 0; + +#if _ISOC11_SOURCE + /* C11 aligned allocation. 
'size' must be a multiple of the alignment. */ + block = aligned_alloc(alignment, size); +#elif defined(_WIN32) + /* A (void *) cast needed for avoiding a warning with MINGW :-/ */ + block = (void *)_aligned_malloc(size, alignment); +#elif _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 + /* Platform does have an implementation of posix_memalign */ + res = posix_memalign(&block, alignment, size); +#elif defined(__APPLE__) + /* Mac OS X guarantees 16-byte alignment in small allocs */ + block = malloc(size); +#else + #error Cannot determine how to allocate aligned memory on the target platform. +#endif + + if (block == NULL || res != 0) { + fprintf(stderr, "Error allocating memory!"); + return NULL; + } + + /* Clean the allocated memory before returning. */ + memset(block, clean_value, size); + + return block; +} + +/** Frees memory allocated by blosc_test_malloc. */ +static void blosc_test_free(void* ptr) +{ +#if defined(_WIN32) + _aligned_free(ptr); +#else + free(ptr); +#endif /* _WIN32 */ +} + +/** Fills a buffer with random values. */ +static void blosc_test_fill_random(void* const ptr, const size_t size) +{ + size_t k; + uint8_t* const byte_ptr = (uint8_t*)ptr; + for (k = 0; k < size; k++) { + byte_ptr[k] = rand(); + } +} + +/* + Argument parsing. +*/ + +/** Parse a `int32_t` value from a string, checking for overflow. */ +static int blosc_test_parse_uint32_t(const char* const str, uint32_t* value) +{ + char* str_end; + int32_t signed_value = strtol(str, &str_end, 10); + if (signed_value < 0 || *str_end) + { + return 0; + } + else + { + *value = (uint32_t)signed_value; + return 1; + } +} + +/* + Error message functions. +*/ + +/** Print an error message when a test program has been invoked + with an invalid number of arguments. 
*/ +static void blosc_test_print_bad_argcount_msg( + const int32_t num_expected_args, const int32_t num_actual_args) +{ + fprintf(stderr, "Invalid number of arguments specified.\nExpected %d arguments but was given %d.", + num_expected_args, num_actual_args); +} + +/** Print an error message when a test program has been invoked + with an invalid argument value. */ +static void blosc_test_print_bad_arg_msg(const int32_t arg_index) +{ + fprintf(stderr, "Invalid value specified for argument at index %d.\n", arg_index); +} + +#endif /* !defined(BLOSC_TEST_COMMON_H) */ diff --git a/c-blosc/tests/test_compress_roundtrip.c b/c-blosc/tests/test_compress_roundtrip.c new file mode 100644 index 000000000..e981f3f50 --- /dev/null +++ b/c-blosc/tests/test_compress_roundtrip.c @@ -0,0 +1,134 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Roundtrip compression/decompression tests. + + Creation date: 2010-06-07 + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "test_common.h" +#include "../blosc/shuffle.h" +#include "../blosc/shuffle-generic.h" + + +/** Perform a compress + decompress round trip. */ +static int test_compress_roundtrip(size_t type_size, size_t num_elements, + size_t buffer_alignment, int compression_level, int do_shuffle) +{ + size_t buffer_size = type_size * num_elements; + int exit_code; + + /* Allocate memory for the test. */ + void* original = blosc_test_malloc(buffer_alignment, buffer_size); + void* intermediate = blosc_test_malloc(buffer_alignment, buffer_size + BLOSC_MAX_OVERHEAD); + void* result = blosc_test_malloc(buffer_alignment, buffer_size); + + /* Fill the input data buffer with random values. */ + blosc_test_fill_random(original, buffer_size); + + /* Compress the input data and store it in an intermediate buffer. 
+ Decompress the data from the intermediate buffer into a result buffer. */ + blosc_compress(compression_level, do_shuffle, type_size, buffer_size, + original, intermediate, buffer_size + BLOSC_MAX_OVERHEAD); + blosc_decompress(intermediate, result, buffer_size); + + /* The round-tripped data matches the original data when the + result of memcmp is 0. */ + exit_code = memcmp(original, result, buffer_size) ? + EXIT_FAILURE : EXIT_SUCCESS; + + /* Free allocated memory. */ + blosc_test_free(original); + blosc_test_free(intermediate); + blosc_test_free(result); + + return exit_code; +} + +/** Required number of arguments to this test, including the executable name. */ +#define TEST_ARG_COUNT 7 + +int main(int argc, char **argv) +{ + int shuffle_enabled; + uint32_t blosc_thread_count; + uint32_t type_size; + uint32_t num_elements; + uint32_t buffer_align_size; + uint32_t compression_level; + int result; + + /* argv[1]: sizeof(element type) + argv[2]: number of elements + argv[3]: buffer alignment + argv[4]: compression level + argv[5]: shuffle enabled + argv[6]: thread count + */ + + /* Verify the correct number of command-line args have been specified. 
*/ + if (TEST_ARG_COUNT != argc) + { + blosc_test_print_bad_argcount_msg(TEST_ARG_COUNT, argc); + return EXIT_FAILURE; + } + + /* Parse arguments */ + if (!blosc_test_parse_uint32_t(argv[1], &type_size) || (type_size < 1)) + { + blosc_test_print_bad_arg_msg(1); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[2], &num_elements) || (num_elements < 1)) + { + blosc_test_print_bad_arg_msg(2); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[3], &buffer_align_size) + || (buffer_align_size & (buffer_align_size - 1)) + || (buffer_align_size < sizeof(void*))) + { + blosc_test_print_bad_arg_msg(3); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[4], &compression_level) || (compression_level > 9)) + { + blosc_test_print_bad_arg_msg(4); + return EXIT_FAILURE; + } + + { + uint32_t shuffle_enabled_raw; + if (!blosc_test_parse_uint32_t(argv[5], &shuffle_enabled_raw) || (shuffle_enabled_raw > 1)) + { + blosc_test_print_bad_arg_msg(5); + return EXIT_FAILURE; + } + shuffle_enabled = shuffle_enabled_raw == 0 ? 0 : 1; + } + + if (!blosc_test_parse_uint32_t(argv[6], &blosc_thread_count) || (blosc_thread_count < 1)) + { + blosc_test_print_bad_arg_msg(6); + return EXIT_FAILURE; + } + + /* Initialize blosc before running tests. */ + blosc_init(); + blosc_set_nthreads(blosc_thread_count); + + /* Run the test. */ + result = test_compress_roundtrip(type_size, num_elements, buffer_align_size, + compression_level, shuffle_enabled); + + /* Cleanup blosc resources. 
*/ + blosc_destroy(); + + return result; +} diff --git a/c-blosc/tests/test_compress_roundtrip.csv b/c-blosc/tests/test_compress_roundtrip.csv new file mode 100644 index 000000000..614bf7a76 --- /dev/null +++ b/c-blosc/tests/test_compress_roundtrip.csv @@ -0,0 +1,267 @@ +"Size of element type (bytes)","Number of elements","Buffer alignment size (bytes)","Compression level","Shuffle enabled","Blosc thread count" +1,7,32,5,0,1 +1,192,32,5,0,1 +1,1792,32,5,0,1 +1,500,32,5,0,1 +1,8000,32,5,0,1 +1,100000,32,5,0,1 +1,702713,32,5,0,1 +2,7,32,5,0,1 +2,192,32,5,0,1 +2,1792,32,5,0,1 +2,500,32,5,0,1 +2,8000,32,5,0,1 +2,100000,32,5,0,1 +2,702713,32,5,0,1 +3,7,32,5,0,1 +3,192,32,5,0,1 +3,1792,32,5,0,1 +3,500,32,5,0,1 +3,8000,32,5,0,1 +3,100000,32,5,0,1 +3,702713,32,5,0,1 +4,7,32,5,0,1 +4,192,32,5,0,1 +4,1792,32,5,0,1 +4,500,32,5,0,1 +4,8000,32,5,0,1 +4,100000,32,5,0,1 +4,702713,32,5,0,1 +5,7,32,5,0,1 +5,192,32,5,0,1 +5,1792,32,5,0,1 +5,500,32,5,0,1 +5,8000,32,5,0,1 +5,100000,32,5,0,1 +5,702713,32,5,0,1 +6,7,32,5,0,1 +6,192,32,5,0,1 +6,1792,32,5,0,1 +6,500,32,5,0,1 +6,8000,32,5,0,1 +6,100000,32,5,0,1 +6,702713,32,5,0,1 +7,7,32,5,0,1 +7,192,32,5,0,1 +7,1792,32,5,0,1 +7,500,32,5,0,1 +7,8000,32,5,0,1 +7,100000,32,5,0,1 +7,702713,32,5,0,1 +8,7,32,5,0,1 +8,192,32,5,0,1 +8,1792,32,5,0,1 +8,500,32,5,0,1 +8,8000,32,5,0,1 +8,100000,32,5,0,1 +8,702713,32,5,0,1 +11,7,32,5,0,1 +11,192,32,5,0,1 +11,1792,32,5,0,1 +11,500,32,5,0,1 +11,8000,32,5,0,1 +11,100000,32,5,0,1 +11,702713,32,5,0,1 +16,7,32,5,0,1 +16,192,32,5,0,1 +16,1792,32,5,0,1 +16,500,32,5,0,1 +16,8000,32,5,0,1 +16,100000,32,5,0,1 +16,702713,32,5,0,1 +22,7,32,5,0,1 +22,192,32,5,0,1 +22,1792,32,5,0,1 +22,500,32,5,0,1 +22,8000,32,5,0,1 +22,100000,32,5,0,1 +22,702713,32,5,0,1 +30,7,32,5,0,1 +30,192,32,5,0,1 +30,1792,32,5,0,1 +30,500,32,5,0,1 +30,8000,32,5,0,1 +30,100000,32,5,0,1 +30,702713,32,5,0,1 +32,7,32,5,0,1 +32,192,32,5,0,1 +32,1792,32,5,0,1 +32,500,32,5,0,1 +32,8000,32,5,0,1 +32,100000,32,5,0,1 +32,702713,32,5,0,1 +42,7,32,5,0,1 
+42,192,32,5,0,1 +42,1792,32,5,0,1 +42,500,32,5,0,1 +42,8000,32,5,0,1 +42,100000,32,5,0,1 +42,702713,32,5,0,1 +48,7,32,5,0,1 +48,192,32,5,0,1 +48,1792,32,5,0,1 +48,500,32,5,0,1 +48,8000,32,5,0,1 +48,100000,32,5,0,1 +48,702713,32,5,0,1 +52,7,32,5,0,1 +52,192,32,5,0,1 +52,1792,32,5,0,1 +52,500,32,5,0,1 +52,8000,32,5,0,1 +52,100000,32,5,0,1 +52,702713,32,5,0,1 +53,7,32,5,0,1 +53,192,32,5,0,1 +53,1792,32,5,0,1 +53,500,32,5,0,1 +53,8000,32,5,0,1 +53,100000,32,5,0,1 +53,702713,32,5,0,1 +64,7,32,5,0,1 +64,192,32,5,0,1 +64,1792,32,5,0,1 +64,500,32,5,0,1 +64,8000,32,5,0,1 +64,100000,32,5,0,1 +64,702713,32,5,0,1 +80,7,32,5,0,1 +80,192,32,5,0,1 +80,1792,32,5,0,1 +80,500,32,5,0,1 +80,8000,32,5,0,1 +80,100000,32,5,0,1 +80,702713,32,5,0,1 +1,7,32,5,1,1 +1,192,32,5,1,1 +1,1792,32,5,1,1 +1,500,32,5,1,1 +1,8000,32,5,1,1 +1,100000,32,5,1,1 +1,702713,32,5,1,1 +2,7,32,5,1,1 +2,192,32,5,1,1 +2,1792,32,5,1,1 +2,500,32,5,1,1 +2,8000,32,5,1,1 +2,100000,32,5,1,1 +2,702713,32,5,1,1 +3,7,32,5,1,1 +3,192,32,5,1,1 +3,1792,32,5,1,1 +3,500,32,5,1,1 +3,8000,32,5,1,1 +3,100000,32,5,1,1 +3,702713,32,5,1,1 +4,7,32,5,1,1 +4,192,32,5,1,1 +4,1792,32,5,1,1 +4,500,32,5,1,1 +4,8000,32,5,1,1 +4,100000,32,5,1,1 +4,702713,32,5,1,1 +5,7,32,5,1,1 +5,192,32,5,1,1 +5,1792,32,5,1,1 +5,500,32,5,1,1 +5,8000,32,5,1,1 +5,100000,32,5,1,1 +5,702713,32,5,1,1 +6,7,32,5,1,1 +6,192,32,5,1,1 +6,1792,32,5,1,1 +6,500,32,5,1,1 +6,8000,32,5,1,1 +6,100000,32,5,1,1 +6,702713,32,5,1,1 +7,7,32,5,1,1 +7,192,32,5,1,1 +7,1792,32,5,1,1 +7,500,32,5,1,1 +7,8000,32,5,1,1 +7,100000,32,5,1,1 +7,702713,32,5,1,1 +8,7,32,5,1,1 +8,192,32,5,1,1 +8,1792,32,5,1,1 +8,500,32,5,1,1 +8,8000,32,5,1,1 +8,100000,32,5,1,1 +8,702713,32,5,1,1 +11,7,32,5,1,1 +11,192,32,5,1,1 +11,1792,32,5,1,1 +11,500,32,5,1,1 +11,8000,32,5,1,1 +11,100000,32,5,1,1 +11,702713,32,5,1,1 +16,7,32,5,1,1 +16,192,32,5,1,1 +16,1792,32,5,1,1 +16,500,32,5,1,1 +16,8000,32,5,1,1 +16,100000,32,5,1,1 +16,702713,32,5,1,1 +22,7,32,5,1,1 +22,192,32,5,1,1 +22,1792,32,5,1,1 +22,500,32,5,1,1 
+22,8000,32,5,1,1 +22,100000,32,5,1,1 +22,702713,32,5,1,1 +30,7,32,5,1,1 +30,192,32,5,1,1 +30,1792,32,5,1,1 +30,500,32,5,1,1 +30,8000,32,5,1,1 +30,100000,32,5,1,1 +30,702713,32,5,1,1 +32,7,32,5,1,1 +32,192,32,5,1,1 +32,1792,32,5,1,1 +32,500,32,5,1,1 +32,8000,32,5,1,1 +32,100000,32,5,1,1 +32,702713,32,5,1,1 +42,7,32,5,1,1 +42,192,32,5,1,1 +42,1792,32,5,1,1 +42,500,32,5,1,1 +42,8000,32,5,1,1 +42,100000,32,5,1,1 +42,702713,32,5,1,1 +48,7,32,5,1,1 +48,192,32,5,1,1 +48,1792,32,5,1,1 +48,500,32,5,1,1 +48,8000,32,5,1,1 +48,100000,32,5,1,1 +48,702713,32,5,1,1 +52,7,32,5,1,1 +52,192,32,5,1,1 +52,1792,32,5,1,1 +52,500,32,5,1,1 +52,8000,32,5,1,1 +52,100000,32,5,1,1 +52,702713,32,5,1,1 +53,7,32,5,1,1 +53,192,32,5,1,1 +53,1792,32,5,1,1 +53,500,32,5,1,1 +53,8000,32,5,1,1 +53,100000,32,5,1,1 +53,702713,32,5,1,1 +64,7,32,5,1,1 +64,192,32,5,1,1 +64,1792,32,5,1,1 +64,500,32,5,1,1 +64,8000,32,5,1,1 +64,100000,32,5,1,1 +64,702713,32,5,1,1 +80,7,32,5,1,1 +80,192,32,5,1,1 +80,1792,32,5,1,1 +80,500,32,5,1,1 +80,8000,32,5,1,1 +80,100000,32,5,1,1 +80,702713,32,5,1,1 diff --git a/c-blosc/tests/test_getitem.c b/c-blosc/tests/test_getitem.c new file mode 100644 index 000000000..273566ba3 --- /dev/null +++ b/c-blosc/tests/test_getitem.c @@ -0,0 +1,130 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Unit tests for the blosc_getitem() function. + + Creation date: 2010-06-07 + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "test_common.h" + + +/** Test the blosc_getitem function. */ +static int test_getitem(size_t type_size, size_t num_elements, + size_t buffer_alignment, int compression_level, int do_shuffle) +{ + size_t buffer_size = type_size * num_elements; + int exit_code; + + /* Allocate memory for the test. 
*/ + void* original = blosc_test_malloc(buffer_alignment, buffer_size); + void* intermediate = blosc_test_malloc(buffer_alignment, buffer_size + BLOSC_MAX_OVERHEAD); + void* result = blosc_test_malloc(buffer_alignment, buffer_size); + + /* Fill the input data buffer with random values. */ + blosc_test_fill_random(original, buffer_size); + + /* Compress the input data, then use blosc_getitem to extract (decompress) + a range of elements into a new buffer. */ + blosc_compress(compression_level, do_shuffle, type_size, buffer_size, + original, intermediate, buffer_size + BLOSC_MAX_OVERHEAD); + blosc_getitem(intermediate, 0, num_elements, result); + + /* The round-tripped data matches the original data when the + result of memcmp is 0. */ + exit_code = memcmp(original, result, buffer_size) ? + EXIT_FAILURE : EXIT_SUCCESS; + + /* Free allocated memory. */ + blosc_test_free(original); + blosc_test_free(intermediate); + blosc_test_free(result); + + return exit_code; +} + +/** Required number of arguments to this test, including the executable name. */ +#define TEST_ARG_COUNT 7 + +int main(int argc, char **argv) +{ + uint32_t type_size; + uint32_t num_elements; + uint32_t buffer_align_size; + uint32_t compression_level; + uint32_t shuffle_enabled; + uint32_t blosc_thread_count; + int result; + + /* argv[1]: sizeof(element type) + argv[2]: number of elements + argv[3]: buffer alignment + argv[4]: compression level + argv[5]: shuffle enabled + argv[6]: thread count + */ + + /* Verify the correct number of command-line args have been specified. 
*/ + if (TEST_ARG_COUNT != argc) + { + blosc_test_print_bad_argcount_msg(TEST_ARG_COUNT, argc); + return EXIT_FAILURE; + } + + /* Parse arguments */ + if (!blosc_test_parse_uint32_t(argv[1], &type_size) || (type_size < 1)) + { + blosc_test_print_bad_arg_msg(1); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[2], &num_elements) || (num_elements < 1)) + { + blosc_test_print_bad_arg_msg(2); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[3], &buffer_align_size) + || (buffer_align_size & (buffer_align_size - 1)) + || (buffer_align_size < sizeof(void*))) + { + blosc_test_print_bad_arg_msg(3); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[4], &compression_level) || (compression_level > 9)) + { + blosc_test_print_bad_arg_msg(4); + return EXIT_FAILURE; + } + + { + if (!blosc_test_parse_uint32_t(argv[5], &shuffle_enabled) || (shuffle_enabled > 2)) + { + blosc_test_print_bad_arg_msg(5); + return EXIT_FAILURE; + } + } + + if (!blosc_test_parse_uint32_t(argv[6], &blosc_thread_count) || (blosc_thread_count < 1)) + { + blosc_test_print_bad_arg_msg(6); + return EXIT_FAILURE; + } + + /* Initialize blosc before running tests. */ + blosc_init(); + blosc_set_nthreads(blosc_thread_count); + + /* Run the test. */ + result = test_getitem(type_size, num_elements, buffer_align_size, + compression_level, shuffle_enabled); + + /* Cleanup blosc resources. 
*/ + blosc_destroy(); + + return result; +} diff --git a/c-blosc/tests/test_getitem.csv b/c-blosc/tests/test_getitem.csv new file mode 100644 index 000000000..16ff87cb2 --- /dev/null +++ b/c-blosc/tests/test_getitem.csv @@ -0,0 +1,400 @@ +"Size of element type (bytes)","Number of elements","Buffer alignment size (bytes)","Compression level","Shuffle enabled","Blosc thread count" +1,7,32,5,0,1 +1,192,32,5,0,1 +1,1792,32,5,0,1 +1,500,32,5,0,1 +1,8000,32,5,0,1 +1,100000,32,5,0,1 +1,702713,32,5,0,1 +2,7,32,5,0,1 +2,192,32,5,0,1 +2,1792,32,5,0,1 +2,500,32,5,0,1 +2,8000,32,5,0,1 +2,100000,32,5,0,1 +2,702713,32,5,0,1 +3,7,32,5,0,1 +3,192,32,5,0,1 +3,1792,32,5,0,1 +3,500,32,5,0,1 +3,8000,32,5,0,1 +3,100000,32,5,0,1 +3,702713,32,5,0,1 +4,7,32,5,0,1 +4,192,32,5,0,1 +4,1792,32,5,0,1 +4,500,32,5,0,1 +4,8000,32,5,0,1 +4,100000,32,5,0,1 +4,702713,32,5,0,1 +5,7,32,5,0,1 +5,192,32,5,0,1 +5,1792,32,5,0,1 +5,500,32,5,0,1 +5,8000,32,5,0,1 +5,100000,32,5,0,1 +5,702713,32,5,0,1 +6,7,32,5,0,1 +6,192,32,5,0,1 +6,1792,32,5,0,1 +6,500,32,5,0,1 +6,8000,32,5,0,1 +6,100000,32,5,0,1 +6,702713,32,5,0,1 +7,7,32,5,0,1 +7,192,32,5,0,1 +7,1792,32,5,0,1 +7,500,32,5,0,1 +7,8000,32,5,0,1 +7,100000,32,5,0,1 +7,702713,32,5,0,1 +8,7,32,5,0,1 +8,192,32,5,0,1 +8,1792,32,5,0,1 +8,500,32,5,0,1 +8,8000,32,5,0,1 +8,100000,32,5,0,1 +8,702713,32,5,0,1 +11,7,32,5,0,1 +11,192,32,5,0,1 +11,1792,32,5,0,1 +11,500,32,5,0,1 +11,8000,32,5,0,1 +11,100000,32,5,0,1 +11,702713,32,5,0,1 +16,7,32,5,0,1 +16,192,32,5,0,1 +16,1792,32,5,0,1 +16,500,32,5,0,1 +16,8000,32,5,0,1 +16,100000,32,5,0,1 +16,702713,32,5,0,1 +22,7,32,5,0,1 +22,192,32,5,0,1 +22,1792,32,5,0,1 +22,500,32,5,0,1 +22,8000,32,5,0,1 +22,100000,32,5,0,1 +22,702713,32,5,0,1 +30,7,32,5,0,1 +30,192,32,5,0,1 +30,1792,32,5,0,1 +30,500,32,5,0,1 +30,8000,32,5,0,1 +30,100000,32,5,0,1 +30,702713,32,5,0,1 +32,7,32,5,0,1 +32,192,32,5,0,1 +32,1792,32,5,0,1 +32,500,32,5,0,1 +32,8000,32,5,0,1 +32,100000,32,5,0,1 +32,702713,32,5,0,1 +42,7,32,5,0,1 +42,192,32,5,0,1 
+42,1792,32,5,0,1 +42,500,32,5,0,1 +42,8000,32,5,0,1 +42,100000,32,5,0,1 +42,702713,32,5,0,1 +48,7,32,5,0,1 +48,192,32,5,0,1 +48,1792,32,5,0,1 +48,500,32,5,0,1 +48,8000,32,5,0,1 +48,100000,32,5,0,1 +48,702713,32,5,0,1 +52,7,32,5,0,1 +52,192,32,5,0,1 +52,1792,32,5,0,1 +52,500,32,5,0,1 +52,8000,32,5,0,1 +52,100000,32,5,0,1 +52,702713,32,5,0,1 +53,7,32,5,0,1 +53,192,32,5,0,1 +53,1792,32,5,0,1 +53,500,32,5,0,1 +53,8000,32,5,0,1 +53,100000,32,5,0,1 +53,702713,32,5,0,1 +64,7,32,5,0,1 +64,192,32,5,0,1 +64,1792,32,5,0,1 +64,500,32,5,0,1 +64,8000,32,5,0,1 +64,100000,32,5,0,1 +64,702713,32,5,0,1 +80,7,32,5,0,1 +80,192,32,5,0,1 +80,1792,32,5,0,1 +80,500,32,5,0,1 +80,8000,32,5,0,1 +80,100000,32,5,0,1 +80,702713,32,5,0,1 +1,7,32,5,1,1 +1,192,32,5,1,1 +1,1792,32,5,1,1 +1,500,32,5,1,1 +1,8000,32,5,1,1 +1,100000,32,5,1,1 +1,702713,32,5,1,1 +2,7,32,5,1,1 +2,192,32,5,1,1 +2,1792,32,5,1,1 +2,500,32,5,1,1 +2,8000,32,5,1,1 +2,100000,32,5,1,1 +2,702713,32,5,1,1 +3,7,32,5,1,1 +3,192,32,5,1,1 +3,1792,32,5,1,1 +3,500,32,5,1,1 +3,8000,32,5,1,1 +3,100000,32,5,1,1 +3,702713,32,5,1,1 +4,7,32,5,1,1 +4,192,32,5,1,1 +4,1792,32,5,1,1 +4,500,32,5,1,1 +4,8000,32,5,1,1 +4,100000,32,5,1,1 +4,702713,32,5,1,1 +5,7,32,5,1,1 +5,192,32,5,1,1 +5,1792,32,5,1,1 +5,500,32,5,1,1 +5,8000,32,5,1,1 +5,100000,32,5,1,1 +5,702713,32,5,1,1 +6,7,32,5,1,1 +6,192,32,5,1,1 +6,1792,32,5,1,1 +6,500,32,5,1,1 +6,8000,32,5,1,1 +6,100000,32,5,1,1 +6,702713,32,5,1,1 +7,7,32,5,1,1 +7,192,32,5,1,1 +7,1792,32,5,1,1 +7,500,32,5,1,1 +7,8000,32,5,1,1 +7,100000,32,5,1,1 +7,702713,32,5,1,1 +8,7,32,5,1,1 +8,192,32,5,1,1 +8,1792,32,5,1,1 +8,500,32,5,1,1 +8,8000,32,5,1,1 +8,100000,32,5,1,1 +8,702713,32,5,1,1 +11,7,32,5,1,1 +11,192,32,5,1,1 +11,1792,32,5,1,1 +11,500,32,5,1,1 +11,8000,32,5,1,1 +11,100000,32,5,1,1 +11,702713,32,5,1,1 +16,7,32,5,1,1 +16,192,32,5,1,1 +16,1792,32,5,1,1 +16,500,32,5,1,1 +16,8000,32,5,1,1 +16,100000,32,5,1,1 +16,702713,32,5,1,1 +22,7,32,5,1,1 +22,192,32,5,1,1 +22,1792,32,5,1,1 +22,500,32,5,1,1 +22,8000,32,5,1,1 
+22,100000,32,5,1,1 +22,702713,32,5,1,1 +30,7,32,5,1,1 +30,192,32,5,1,1 +30,1792,32,5,1,1 +30,500,32,5,1,1 +30,8000,32,5,1,1 +30,100000,32,5,1,1 +30,702713,32,5,1,1 +32,7,32,5,1,1 +32,192,32,5,1,1 +32,1792,32,5,1,1 +32,500,32,5,1,1 +32,8000,32,5,1,1 +32,100000,32,5,1,1 +32,702713,32,5,1,1 +42,7,32,5,1,1 +42,192,32,5,1,1 +42,1792,32,5,1,1 +42,500,32,5,1,1 +42,8000,32,5,1,1 +42,100000,32,5,1,1 +42,702713,32,5,1,1 +48,7,32,5,1,1 +48,192,32,5,1,1 +48,1792,32,5,1,1 +48,500,32,5,1,1 +48,8000,32,5,1,1 +48,100000,32,5,1,1 +48,702713,32,5,1,1 +52,7,32,5,1,1 +52,192,32,5,1,1 +52,1792,32,5,1,1 +52,500,32,5,1,1 +52,8000,32,5,1,1 +52,100000,32,5,1,1 +52,702713,32,5,1,1 +53,7,32,5,1,1 +53,192,32,5,1,1 +53,1792,32,5,1,1 +53,500,32,5,1,1 +53,8000,32,5,1,1 +53,100000,32,5,1,1 +53,702713,32,5,1,1 +64,7,32,5,1,1 +64,192,32,5,1,1 +64,1792,32,5,1,1 +64,500,32,5,1,1 +64,8000,32,5,1,1 +64,100000,32,5,1,1 +64,702713,32,5,1,1 +80,7,32,5,1,1 +80,192,32,5,1,1 +80,1792,32,5,1,1 +80,500,32,5,1,1 +80,8000,32,5,1,1 +80,100000,32,5,1,1 +80,702713,32,5,1,1 +1,7,32,5,2,1 +1,192,32,5,2,1 +1,1792,32,5,2,1 +1,500,32,5,2,1 +1,8000,32,5,2,1 +1,100000,32,5,2,1 +1,702713,32,5,2,1 +2,7,32,5,2,1 +2,192,32,5,2,1 +2,1792,32,5,2,1 +2,500,32,5,2,1 +2,8000,32,5,2,1 +2,100000,32,5,2,1 +2,702713,32,5,2,1 +3,7,32,5,2,1 +3,192,32,5,2,1 +3,1792,32,5,2,1 +3,500,32,5,2,1 +3,8000,32,5,2,1 +3,100000,32,5,2,1 +3,702713,32,5,2,1 +4,7,32,5,2,1 +4,192,32,5,2,1 +4,1792,32,5,2,1 +4,500,32,5,2,1 +4,8000,32,5,2,1 +4,100000,32,5,2,1 +4,702713,32,5,2,1 +5,7,32,5,2,1 +5,192,32,5,2,1 +5,1792,32,5,2,1 +5,500,32,5,2,1 +5,8000,32,5,2,1 +5,100000,32,5,2,1 +5,702713,32,5,2,1 +6,7,32,5,2,1 +6,192,32,5,2,1 +6,1792,32,5,2,1 +6,500,32,5,2,1 +6,8000,32,5,2,1 +6,100000,32,5,2,1 +6,702713,32,5,2,1 +7,7,32,5,2,1 +7,192,32,5,2,1 +7,1792,32,5,2,1 +7,500,32,5,2,1 +7,8000,32,5,2,1 +7,100000,32,5,2,1 +7,702713,32,5,2,1 +8,7,32,5,2,1 +8,192,32,5,2,1 +8,1792,32,5,2,1 +8,500,32,5,2,1 +8,8000,32,5,2,1 +8,100000,32,5,2,1 +8,702713,32,5,2,1 +11,7,32,5,2,1 
+11,192,32,5,2,1 +11,1792,32,5,2,1 +11,500,32,5,2,1 +11,8000,32,5,2,1 +11,100000,32,5,2,1 +11,702713,32,5,2,1 +16,7,32,5,2,1 +16,192,32,5,2,1 +16,1792,32,5,2,1 +16,500,32,5,2,1 +16,8000,32,5,2,1 +16,100000,32,5,2,1 +16,702713,32,5,2,1 +22,7,32,5,2,1 +22,192,32,5,2,1 +22,1792,32,5,2,1 +22,500,32,5,2,1 +22,8000,32,5,2,1 +22,100000,32,5,2,1 +22,702713,32,5,2,1 +30,7,32,5,2,1 +30,192,32,5,2,1 +30,1792,32,5,2,1 +30,500,32,5,2,1 +30,8000,32,5,2,1 +30,100000,32,5,2,1 +30,702713,32,5,2,1 +32,7,32,5,2,1 +32,192,32,5,2,1 +32,1792,32,5,2,1 +32,500,32,5,2,1 +32,8000,32,5,2,1 +32,100000,32,5,2,1 +32,702713,32,5,2,1 +42,7,32,5,2,1 +42,192,32,5,2,1 +42,1792,32,5,2,1 +42,500,32,5,2,1 +42,8000,32,5,2,1 +42,100000,32,5,2,1 +42,702713,32,5,2,1 +48,7,32,5,2,1 +48,192,32,5,2,1 +48,1792,32,5,2,1 +48,500,32,5,2,1 +48,8000,32,5,2,1 +48,100000,32,5,2,1 +48,702713,32,5,2,1 +52,7,32,5,2,1 +52,192,32,5,2,1 +52,1792,32,5,2,1 +52,500,32,5,2,1 +52,8000,32,5,2,1 +52,100000,32,5,2,1 +52,702713,32,5,2,1 +53,7,32,5,2,1 +53,192,32,5,2,1 +53,1792,32,5,2,1 +53,500,32,5,2,1 +53,8000,32,5,2,1 +53,100000,32,5,2,1 +53,702713,32,5,2,1 +64,7,32,5,2,1 +64,192,32,5,2,1 +64,1792,32,5,2,1 +64,500,32,5,2,1 +64,8000,32,5,2,1 +64,100000,32,5,2,1 +64,702713,32,5,2,1 +80,7,32,5,2,1 +80,192,32,5,2,1 +80,1792,32,5,2,1 +80,500,32,5,2,1 +80,8000,32,5,2,1 +80,100000,32,5,2,1 +80,702713,32,5,2,1 diff --git a/c-blosc/tests/test_maxout.c b/c-blosc/tests/test_maxout.c new file mode 100644 index 000000000..1603bd4cd --- /dev/null +++ b/c-blosc/tests/test_maxout.c @@ -0,0 +1,117 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Unit tests for basic features in Blosc. + + Creation date: 2010-06-07 + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. 
+**********************************************************************/ + +#include "test_common.h" + +int tests_run = 0; + +/* Global vars */ +void *src, *srccpy, *dest, *dest2; +size_t nbytes, cbytes; +int clevel = 1; +int doshuffle = 0; +size_t typesize = 4; +size_t size = 1000; /* must be divisible by 4 */ + + +/* Check maxout with maxout < size */ +static char *test_maxout_less() { + + /* Get a compressed buffer */ + cbytes = blosc_compress(clevel, doshuffle, typesize, size, src, + dest, size+15); + mu_assert("ERROR: cbytes is not 0", cbytes == 0); + + return 0; +} + + +/* Check maxout with maxout == size */ +static char *test_maxout_equal() { + + /* Get a compressed buffer */ + cbytes = blosc_compress(clevel, doshuffle, typesize, size, src, + dest, size+16); + mu_assert("ERROR: cbytes is not correct", cbytes == size+16); + + /* Decompress the buffer */ + nbytes = blosc_decompress(dest, dest2, size); + mu_assert("ERROR: nbytes incorrect(1)", nbytes == size); + + return 0; +} + + +/* Check maxout with maxout > size */ +static char *test_maxout_great() { + /* Get a compressed buffer */ + cbytes = blosc_compress(clevel, doshuffle, typesize, size, src, + dest, size+17); + mu_assert("ERROR: cbytes is not 0", cbytes == size+16); + + /* Decompress the buffer */ + nbytes = blosc_decompress(dest, dest2, size); + mu_assert("ERROR: nbytes incorrect(1)", nbytes == size); + + return 0; +} + + +static char *all_tests() { + mu_run_test(test_maxout_less); + mu_run_test(test_maxout_equal); + mu_run_test(test_maxout_great); + + return 0; +} + +#define BUFFER_ALIGN_SIZE 32 + +int main(int argc, char **argv) { + int32_t *_src; + char *result; + size_t i; + + printf("STARTING TESTS for %s", argv[0]); + + blosc_init(); + blosc_set_nthreads(1); + + /* Initialize buffers */ + src = blosc_test_malloc(BUFFER_ALIGN_SIZE, size); + srccpy = blosc_test_malloc(BUFFER_ALIGN_SIZE, size); + dest = blosc_test_malloc(BUFFER_ALIGN_SIZE, size + 16); + dest2 = blosc_test_malloc(BUFFER_ALIGN_SIZE, 
size); + _src = (int32_t *)src; + for (i=0; i < (size/4); i++) { + _src[i] = (int32_t)i; + } + memcpy(srccpy, src, size); + + /* Run all the suite */ + result = all_tests(); + if (result != 0) { + printf(" (%s)\n", result); + } + else { + printf(" ALL TESTS PASSED"); + } + printf("\tTests run: %d\n", tests_run); + + blosc_test_free(src); + blosc_test_free(srccpy); + blosc_test_free(dest); + blosc_test_free(dest2); + + blosc_destroy(); + + return result != 0; +} diff --git a/c-blosc/tests/test_shuffle_roundtrip_avx2.c b/c-blosc/tests/test_shuffle_roundtrip_avx2.c new file mode 100644 index 000000000..51b6009c7 --- /dev/null +++ b/c-blosc/tests/test_shuffle_roundtrip_avx2.c @@ -0,0 +1,137 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Roundtrip tests for the AVX2-accelerated shuffle/unshuffle. + + Creation date: 2010-06-07 + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "test_common.h" +#include "../blosc/shuffle.h" +#include "../blosc/shuffle-generic.h" + +/* Include accelerated shuffles if supported by this compiler. + TODO: Need to also do run-time CPU feature support here. */ + +#if defined(SHUFFLE_AVX2_ENABLED) + #include "../blosc/shuffle-avx2.h" +#else + #if defined(_MSC_VER) + #pragma message("AVX2 shuffle tests not enabled.") + #else + #warning AVX2 shuffle tests not enabled. + #endif +#endif /* defined(SHUFFLE_AVX2_ENABLED) */ + + +/** Roundtrip tests for the AVX2-accelerated shuffle/unshuffle. */ +static int test_shuffle_roundtrip_avx2(size_t type_size, size_t num_elements, + size_t buffer_alignment, int test_type) +{ +#if defined(SHUFFLE_AVX2_ENABLED) + size_t buffer_size = type_size * num_elements; + + /* Allocate memory for the test. 
*/ + void* original = blosc_test_malloc(buffer_alignment, buffer_size); + void* shuffled = blosc_test_malloc(buffer_alignment, buffer_size); + void* unshuffled = blosc_test_malloc(buffer_alignment, buffer_size); + + /* Fill the input data buffer with random values. */ + blosc_test_fill_random(original, buffer_size); + + /* Shuffle/unshuffle, selecting the implementations based on the test type. */ + switch(test_type) + { + case 0: + /* avx2/avx2 */ + shuffle_avx2(type_size, buffer_size, original, shuffled); + unshuffle_avx2(type_size, buffer_size, shuffled, unshuffled); + break; + case 1: + /* generic/avx2 */ + shuffle_generic(type_size, buffer_size, original, shuffled); + unshuffle_avx2(type_size, buffer_size, shuffled, unshuffled); + break; + case 2: + /* avx2/generic */ + shuffle_avx2(type_size, buffer_size, original, shuffled); + unshuffle_generic(type_size, buffer_size, shuffled, unshuffled); + break; + default: + fprintf(stderr, "Invalid test type specified (%d).", test_type); + return EXIT_FAILURE; + } + + /* The round-tripped data matches the original data when the + result of memcmp is 0. */ + int exit_code = memcmp(original, unshuffled, buffer_size) ? + EXIT_FAILURE : EXIT_SUCCESS; + + /* Free allocated memory. */ + blosc_test_free(original); + blosc_test_free(shuffled); + blosc_test_free(unshuffled); + + return exit_code; +#else + return EXIT_SUCCESS; +#endif /* defined(SHUFFLE_AVX2_ENABLED) */ +} + + +/** Required number of arguments to this test, including the executable name. */ +#define TEST_ARG_COUNT 5 + +int main(int argc, char **argv) +{ + uint32_t type_size; + uint32_t num_elements; + uint32_t buffer_align_size; + uint32_t test_type; + + /* argv[1]: sizeof(element type) + argv[2]: number of elements + argv[3]: buffer alignment + argv[4]: test type + */ + + /* Verify the correct number of command-line args have been specified. 
*/ + if (TEST_ARG_COUNT != argc) + { + blosc_test_print_bad_argcount_msg(TEST_ARG_COUNT, argc); + return EXIT_FAILURE; + } + + /* Parse arguments */ + if (!blosc_test_parse_uint32_t(argv[1], &type_size) || (type_size < 1)) + { + blosc_test_print_bad_arg_msg(1); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[2], &num_elements) || (num_elements < 1)) + { + blosc_test_print_bad_arg_msg(2); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[3], &buffer_align_size) + || (buffer_align_size & (buffer_align_size - 1)) + || (buffer_align_size < sizeof(void*))) + { + blosc_test_print_bad_arg_msg(3); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[4], &test_type) || (test_type > 2)) + { + blosc_test_print_bad_arg_msg(4); + return EXIT_FAILURE; + } + + /* Run the test. */ + return test_shuffle_roundtrip_avx2(type_size, num_elements, buffer_align_size, test_type); +} diff --git a/c-blosc/tests/test_shuffle_roundtrip_avx2.csv b/c-blosc/tests/test_shuffle_roundtrip_avx2.csv new file mode 100644 index 000000000..ccff286e2 --- /dev/null +++ b/c-blosc/tests/test_shuffle_roundtrip_avx2.csv @@ -0,0 +1,400 @@ +"Size of element type (bytes)","Number of elements","Buffer alignment size (bytes)","Test type" +1,7,32,0 +1,7,32,1 +1,7,32,2 +1,192,32,0 +1,192,32,1 +1,192,32,2 +1,1792,32,0 +1,1792,32,1 +1,1792,32,2 +1,500,32,0 +1,500,32,1 +1,500,32,2 +1,8000,32,0 +1,8000,32,1 +1,8000,32,2 +1,100000,32,0 +1,100000,32,1 +1,100000,32,2 +1,702713,32,0 +1,702713,32,1 +1,702713,32,2 +2,7,32,0 +2,7,32,1 +2,7,32,2 +2,192,32,0 +2,192,32,1 +2,192,32,2 +2,1792,32,0 +2,1792,32,1 +2,1792,32,2 +2,500,32,0 +2,500,32,1 +2,500,32,2 +2,8000,32,0 +2,8000,32,1 +2,8000,32,2 +2,100000,32,0 +2,100000,32,1 +2,100000,32,2 +2,702713,32,0 +2,702713,32,1 +2,702713,32,2 +3,7,32,0 +3,7,32,1 +3,7,32,2 +3,192,32,0 +3,192,32,1 +3,192,32,2 +3,1792,32,0 +3,1792,32,1 +3,1792,32,2 +3,500,32,0 +3,500,32,1 +3,500,32,2 +3,8000,32,0 +3,8000,32,1 +3,8000,32,2 +3,100000,32,0 
+3,100000,32,1 +3,100000,32,2 +3,702713,32,0 +3,702713,32,1 +3,702713,32,2 +4,7,32,0 +4,7,32,1 +4,7,32,2 +4,192,32,0 +4,192,32,1 +4,192,32,2 +4,1792,32,0 +4,1792,32,1 +4,1792,32,2 +4,500,32,0 +4,500,32,1 +4,500,32,2 +4,8000,32,0 +4,8000,32,1 +4,8000,32,2 +4,100000,32,0 +4,100000,32,1 +4,100000,32,2 +4,702713,32,0 +4,702713,32,1 +4,702713,32,2 +5,7,32,0 +5,7,32,1 +5,7,32,2 +5,192,32,0 +5,192,32,1 +5,192,32,2 +5,1792,32,0 +5,1792,32,1 +5,1792,32,2 +5,500,32,0 +5,500,32,1 +5,500,32,2 +5,8000,32,0 +5,8000,32,1 +5,8000,32,2 +5,100000,32,0 +5,100000,32,1 +5,100000,32,2 +5,702713,32,0 +5,702713,32,1 +5,702713,32,2 +6,7,32,0 +6,7,32,1 +6,7,32,2 +6,192,32,0 +6,192,32,1 +6,192,32,2 +6,1792,32,0 +6,1792,32,1 +6,1792,32,2 +6,500,32,0 +6,500,32,1 +6,500,32,2 +6,8000,32,0 +6,8000,32,1 +6,8000,32,2 +6,100000,32,0 +6,100000,32,1 +6,100000,32,2 +6,702713,32,0 +6,702713,32,1 +6,702713,32,2 +7,7,32,0 +7,7,32,1 +7,7,32,2 +7,192,32,0 +7,192,32,1 +7,192,32,2 +7,1792,32,0 +7,1792,32,1 +7,1792,32,2 +7,500,32,0 +7,500,32,1 +7,500,32,2 +7,8000,32,0 +7,8000,32,1 +7,8000,32,2 +7,100000,32,0 +7,100000,32,1 +7,100000,32,2 +7,702713,32,0 +7,702713,32,1 +7,702713,32,2 +8,7,32,0 +8,7,32,1 +8,7,32,2 +8,192,32,0 +8,192,32,1 +8,192,32,2 +8,1792,32,0 +8,1792,32,1 +8,1792,32,2 +8,500,32,0 +8,500,32,1 +8,500,32,2 +8,8000,32,0 +8,8000,32,1 +8,8000,32,2 +8,100000,32,0 +8,100000,32,1 +8,100000,32,2 +8,702713,32,0 +8,702713,32,1 +8,702713,32,2 +11,7,32,0 +11,7,32,1 +11,7,32,2 +11,192,32,0 +11,192,32,1 +11,192,32,2 +11,1792,32,0 +11,1792,32,1 +11,1792,32,2 +11,500,32,0 +11,500,32,1 +11,500,32,2 +11,8000,32,0 +11,8000,32,1 +11,8000,32,2 +11,100000,32,0 +11,100000,32,1 +11,100000,32,2 +11,702713,32,0 +11,702713,32,1 +11,702713,32,2 +16,7,32,0 +16,7,32,1 +16,7,32,2 +16,192,32,0 +16,192,32,1 +16,192,32,2 +16,1792,32,0 +16,1792,32,1 +16,1792,32,2 +16,500,32,0 +16,500,32,1 +16,500,32,2 +16,8000,32,0 +16,8000,32,1 +16,8000,32,2 +16,100000,32,0 +16,100000,32,1 +16,100000,32,2 +16,702713,32,0 +16,702713,32,1 
+16,702713,32,2 +22,7,32,0 +22,7,32,1 +22,7,32,2 +22,192,32,0 +22,192,32,1 +22,192,32,2 +22,1792,32,0 +22,1792,32,1 +22,1792,32,2 +22,500,32,0 +22,500,32,1 +22,500,32,2 +22,8000,32,0 +22,8000,32,1 +22,8000,32,2 +22,100000,32,0 +22,100000,32,1 +22,100000,32,2 +22,702713,32,0 +22,702713,32,1 +22,702713,32,2 +30,7,32,0 +30,7,32,1 +30,7,32,2 +30,192,32,0 +30,192,32,1 +30,192,32,2 +30,1792,32,0 +30,1792,32,1 +30,1792,32,2 +30,500,32,0 +30,500,32,1 +30,500,32,2 +30,8000,32,0 +30,8000,32,1 +30,8000,32,2 +30,100000,32,0 +30,100000,32,1 +30,100000,32,2 +30,702713,32,0 +30,702713,32,1 +30,702713,32,2 +32,7,32,0 +32,7,32,1 +32,7,32,2 +32,192,32,0 +32,192,32,1 +32,192,32,2 +32,1792,32,0 +32,1792,32,1 +32,1792,32,2 +32,500,32,0 +32,500,32,1 +32,500,32,2 +32,8000,32,0 +32,8000,32,1 +32,8000,32,2 +32,100000,32,0 +32,100000,32,1 +32,100000,32,2 +32,702713,32,0 +32,702713,32,1 +32,702713,32,2 +42,7,32,0 +42,7,32,1 +42,7,32,2 +42,192,32,0 +42,192,32,1 +42,192,32,2 +42,1792,32,0 +42,1792,32,1 +42,1792,32,2 +42,500,32,0 +42,500,32,1 +42,500,32,2 +42,8000,32,0 +42,8000,32,1 +42,8000,32,2 +42,100000,32,0 +42,100000,32,1 +42,100000,32,2 +42,702713,32,0 +42,702713,32,1 +42,702713,32,2 +48,7,32,0 +48,7,32,1 +48,7,32,2 +48,192,32,0 +48,192,32,1 +48,192,32,2 +48,1792,32,0 +48,1792,32,1 +48,1792,32,2 +48,500,32,0 +48,500,32,1 +48,500,32,2 +48,8000,32,0 +48,8000,32,1 +48,8000,32,2 +48,100000,32,0 +48,100000,32,1 +48,100000,32,2 +48,702713,32,0 +48,702713,32,1 +48,702713,32,2 +52,7,32,0 +52,7,32,1 +52,7,32,2 +52,192,32,0 +52,192,32,1 +52,192,32,2 +52,1792,32,0 +52,1792,32,1 +52,1792,32,2 +52,500,32,0 +52,500,32,1 +52,500,32,2 +52,8000,32,0 +52,8000,32,1 +52,8000,32,2 +52,100000,32,0 +52,100000,32,1 +52,100000,32,2 +52,702713,32,0 +52,702713,32,1 +52,702713,32,2 +53,7,32,0 +53,7,32,1 +53,7,32,2 +53,192,32,0 +53,192,32,1 +53,192,32,2 +53,1792,32,0 +53,1792,32,1 +53,1792,32,2 +53,500,32,0 +53,500,32,1 +53,500,32,2 +53,8000,32,0 +53,8000,32,1 +53,8000,32,2 +53,100000,32,0 +53,100000,32,1 
+53,100000,32,2 +53,702713,32,0 +53,702713,32,1 +53,702713,32,2 +64,7,32,0 +64,7,32,1 +64,7,32,2 +64,192,32,0 +64,192,32,1 +64,192,32,2 +64,1792,32,0 +64,1792,32,1 +64,1792,32,2 +64,500,32,0 +64,500,32,1 +64,500,32,2 +64,8000,32,0 +64,8000,32,1 +64,8000,32,2 +64,100000,32,0 +64,100000,32,1 +64,100000,32,2 +64,702713,32,0 +64,702713,32,1 +64,702713,32,2 +80,7,32,0 +80,7,32,1 +80,7,32,2 +80,192,32,0 +80,192,32,1 +80,192,32,2 +80,1792,32,0 +80,1792,32,1 +80,1792,32,2 +80,500,32,0 +80,500,32,1 +80,500,32,2 +80,8000,32,0 +80,8000,32,1 +80,8000,32,2 +80,100000,32,0 +80,100000,32,1 +80,100000,32,2 +80,702713,32,0 +80,702713,32,1 +80,702713,32,2 \ No newline at end of file diff --git a/c-blosc/tests/test_shuffle_roundtrip_generic.c b/c-blosc/tests/test_shuffle_roundtrip_generic.c new file mode 100644 index 000000000..43680b5d3 --- /dev/null +++ b/c-blosc/tests/test_shuffle_roundtrip_generic.c @@ -0,0 +1,93 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Roundtrip tests + + Creation date: 2010-06-07 + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "test_common.h" +#include "../blosc/shuffle.h" +#include "../blosc/shuffle-generic.h" + + +/** Roundtrip tests for the generic shuffle/unshuffle. */ +static int test_shuffle_roundtrip_generic(size_t type_size, size_t num_elements, + size_t buffer_alignment) +{ + size_t buffer_size = type_size * num_elements; + int exit_code; + + /* Allocate memory for the test. */ + void* original = blosc_test_malloc(buffer_alignment, buffer_size); + void* shuffled = blosc_test_malloc(buffer_alignment, buffer_size); + void* unshuffled = blosc_test_malloc(buffer_alignment, buffer_size); + + /* Fill the input data buffer with random values. 
*/ + blosc_test_fill_random(original, buffer_size); + + /* Generic shuffle, then generic unshuffle. */ + shuffle_generic(type_size, buffer_size, original, shuffled); + unshuffle_generic(type_size, buffer_size, shuffled, unshuffled); + + /* The round-tripped data matches the original data when the + result of memcmp is 0. */ + exit_code = memcmp(original, unshuffled, buffer_size) ? + EXIT_FAILURE : EXIT_SUCCESS; + + /* Free allocated memory. */ + blosc_test_free(original); + blosc_test_free(shuffled); + blosc_test_free(unshuffled); + + return exit_code; +} + +/** Required number of arguments to this test, including the executable name. */ +#define TEST_ARG_COUNT 4 + +int main(int argc, char **argv) +{ + uint32_t type_size; + uint32_t num_elements; + uint32_t buffer_align_size; + + /* argv[1]: sizeof(element type) + argv[2]: number of elements + argv[3]: buffer alignment + */ + + /* Verify the correct number of command-line args have been specified. */ + if (TEST_ARG_COUNT != argc) + { + blosc_test_print_bad_argcount_msg(TEST_ARG_COUNT, argc); + return EXIT_FAILURE; + } + + /* Parse arguments */ + if (!blosc_test_parse_uint32_t(argv[1], &type_size) || (type_size < 1)) + { + blosc_test_print_bad_arg_msg(1); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[2], &num_elements) || (num_elements < 1)) + { + blosc_test_print_bad_arg_msg(2); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[3], &buffer_align_size) + || (buffer_align_size & (buffer_align_size - 1)) + || (buffer_align_size < sizeof(void*))) + { + blosc_test_print_bad_arg_msg(3); + return EXIT_FAILURE; + } + + /* Run the test. 
*/ + return test_shuffle_roundtrip_generic(type_size, num_elements, buffer_align_size); +} diff --git a/c-blosc/tests/test_shuffle_roundtrip_generic.csv b/c-blosc/tests/test_shuffle_roundtrip_generic.csv new file mode 100644 index 000000000..3fc6f7139 --- /dev/null +++ b/c-blosc/tests/test_shuffle_roundtrip_generic.csv @@ -0,0 +1,134 @@ +"Size of element type (bytes)","Number of elements","Buffer alignment size (bytes)" +1,7,8 +1,192,8 +1,1792,8 +1,500,8 +1,8000,8 +1,100000,8 +1,702713,8 +2,7,8 +2,192,8 +2,1792,8 +2,500,8 +2,8000,8 +2,100000,8 +2,702713,8 +3,7,8 +3,192,8 +3,1792,8 +3,500,8 +3,8000,8 +3,100000,8 +3,702713,8 +4,7,8 +4,192,8 +4,1792,8 +4,500,8 +4,8000,8 +4,100000,8 +4,702713,8 +5,7,8 +5,192,8 +5,1792,8 +5,500,8 +5,8000,8 +5,100000,8 +5,702713,8 +6,7,8 +6,192,8 +6,1792,8 +6,500,8 +6,8000,8 +6,100000,8 +6,702713,8 +7,7,8 +7,192,8 +7,1792,8 +7,500,8 +7,8000,8 +7,100000,8 +7,702713,8 +8,7,8 +8,192,8 +8,1792,8 +8,500,8 +8,8000,8 +8,100000,8 +8,702713,8 +11,7,8 +11,192,8 +11,1792,8 +11,500,8 +11,8000,8 +11,100000,8 +11,702713,8 +16,7,8 +16,192,8 +16,1792,8 +16,500,8 +16,8000,8 +16,100000,8 +16,702713,8 +22,7,8 +22,192,8 +22,1792,8 +22,500,8 +22,8000,8 +22,100000,8 +22,702713,8 +30,7,8 +30,192,8 +30,1792,8 +30,500,8 +30,8000,8 +30,100000,8 +30,702713,8 +32,7,8 +32,192,8 +32,1792,8 +32,500,8 +32,8000,8 +32,100000,8 +32,702713,8 +42,7,8 +42,192,8 +42,1792,8 +42,500,8 +42,8000,8 +42,100000,8 +42,702713,8 +48,7,8 +48,192,8 +48,1792,8 +48,500,8 +48,8000,8 +48,100000,8 +48,702713,8 +52,7,8 +52,192,8 +52,1792,8 +52,500,8 +52,8000,8 +52,100000,8 +52,702713,8 +53,7,8 +53,192,8 +53,1792,8 +53,500,8 +53,8000,8 +53,100000,8 +53,702713,8 +64,7,8 +64,192,8 +64,1792,8 +64,500,8 +64,8000,8 +64,100000,8 +64,702713,8 +80,7,8 +80,192,8 +80,1792,8 +80,500,8 +80,8000,8 +80,100000,8 +80,702713,8 \ No newline at end of file diff --git a/c-blosc/tests/test_shuffle_roundtrip_sse2.c b/c-blosc/tests/test_shuffle_roundtrip_sse2.c new file mode 100644 index 000000000..41e0d3309 --- 
/dev/null +++ b/c-blosc/tests/test_shuffle_roundtrip_sse2.c @@ -0,0 +1,138 @@ +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Roundtrip tests for the SSE2-accelerated shuffle/unshuffle. + + Creation date: 2010-06-07 + Author: Francesc Alted + + See LICENSES/BLOSC.txt for details about copyright and rights to use. +**********************************************************************/ + +#include "test_common.h" +#include "../blosc/shuffle.h" +#include "../blosc/shuffle-generic.h" + + +/* Include SSE2-accelerated shuffle implementation if supported by this compiler. + TODO: Need to also do run-time CPU feature support here. */ +#if defined(SHUFFLE_SSE2_ENABLED) + #include "../blosc/shuffle-sse2.h" +#else + #if defined(_MSC_VER) + #pragma message("SSE2 shuffle tests not enabled.") + #else + #warning SSE2 shuffle tests not enabled. + #endif +#endif /* defined(SHUFFLE_SSE2_ENABLED) */ + + +/** Roundtrip tests for the SSE2-accelerated shuffle/unshuffle. */ +static int test_shuffle_roundtrip_sse2(size_t type_size, size_t num_elements, + size_t buffer_alignment, int test_type) +{ +#if defined(SHUFFLE_SSE2_ENABLED) + size_t buffer_size = type_size * num_elements; + int exit_code; + + /* Allocate memory for the test. */ + void* original = blosc_test_malloc(buffer_alignment, buffer_size); + void* shuffled = blosc_test_malloc(buffer_alignment, buffer_size); + void* unshuffled = blosc_test_malloc(buffer_alignment, buffer_size); + + /* Fill the input data buffer with random values. */ + blosc_test_fill_random(original, buffer_size); + + /* Shuffle/unshuffle, selecting the implementations based on the test type. 
*/ + switch(test_type) + { + case 0: + /* sse2/sse2 */ + shuffle_sse2(type_size, buffer_size, original, shuffled); + unshuffle_sse2(type_size, buffer_size, shuffled, unshuffled); + break; + case 1: + /* generic/sse2 */ + shuffle_generic(type_size, buffer_size, original, shuffled); + unshuffle_sse2(type_size, buffer_size, shuffled, unshuffled); + break; + case 2: + /* sse2/generic */ + shuffle_sse2(type_size, buffer_size, original, shuffled); + unshuffle_generic(type_size, buffer_size, shuffled, unshuffled); + break; + default: + fprintf(stderr, "Invalid test type specified (%d).", test_type); + return EXIT_FAILURE; + } + + /* The round-tripped data matches the original data when the + result of memcmp is 0. */ + exit_code = memcmp(original, unshuffled, buffer_size) ? + EXIT_FAILURE : EXIT_SUCCESS; + + /* Free allocated memory. */ + blosc_test_free(original); + blosc_test_free(shuffled); + blosc_test_free(unshuffled); + + return exit_code; +#else + return EXIT_SUCCESS; +#endif /* defined(SHUFFLE_SSE2_ENABLED) */ +} + + +/** Required number of arguments to this test, including the executable name. */ +#define TEST_ARG_COUNT 5 + +int main(int argc, char **argv) +{ + uint32_t type_size; + uint32_t num_elements; + uint32_t buffer_align_size; + uint32_t test_type; + + /* argv[1]: sizeof(element type) + argv[2]: number of elements + argv[3]: buffer alignment + argv[4]: test type + */ + + /* Verify the correct number of command-line args have been specified. 
*/ + if (TEST_ARG_COUNT != argc) + { + blosc_test_print_bad_argcount_msg(TEST_ARG_COUNT, argc); + return EXIT_FAILURE; + } + + /* Parse arguments */ + if (!blosc_test_parse_uint32_t(argv[1], &type_size) || (type_size < 1)) + { + blosc_test_print_bad_arg_msg(1); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[2], &num_elements) || (num_elements < 1)) + { + blosc_test_print_bad_arg_msg(2); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[3], &buffer_align_size) + || (buffer_align_size & (buffer_align_size - 1)) + || (buffer_align_size < sizeof(void*))) + { + blosc_test_print_bad_arg_msg(3); + return EXIT_FAILURE; + } + + if (!blosc_test_parse_uint32_t(argv[4], &test_type) || (test_type > 2)) + { + blosc_test_print_bad_arg_msg(4); + return EXIT_FAILURE; + } + + /* Run the test. */ + return test_shuffle_roundtrip_sse2(type_size, num_elements, buffer_align_size, test_type); +} diff --git a/c-blosc/tests/test_shuffle_roundtrip_sse2.csv b/c-blosc/tests/test_shuffle_roundtrip_sse2.csv new file mode 100644 index 000000000..ccff286e2 --- /dev/null +++ b/c-blosc/tests/test_shuffle_roundtrip_sse2.csv @@ -0,0 +1,400 @@ +"Size of element type (bytes)","Number of elements","Buffer alignment size (bytes)","Test type" +1,7,32,0 +1,7,32,1 +1,7,32,2 +1,192,32,0 +1,192,32,1 +1,192,32,2 +1,1792,32,0 +1,1792,32,1 +1,1792,32,2 +1,500,32,0 +1,500,32,1 +1,500,32,2 +1,8000,32,0 +1,8000,32,1 +1,8000,32,2 +1,100000,32,0 +1,100000,32,1 +1,100000,32,2 +1,702713,32,0 +1,702713,32,1 +1,702713,32,2 +2,7,32,0 +2,7,32,1 +2,7,32,2 +2,192,32,0 +2,192,32,1 +2,192,32,2 +2,1792,32,0 +2,1792,32,1 +2,1792,32,2 +2,500,32,0 +2,500,32,1 +2,500,32,2 +2,8000,32,0 +2,8000,32,1 +2,8000,32,2 +2,100000,32,0 +2,100000,32,1 +2,100000,32,2 +2,702713,32,0 +2,702713,32,1 +2,702713,32,2 +3,7,32,0 +3,7,32,1 +3,7,32,2 +3,192,32,0 +3,192,32,1 +3,192,32,2 +3,1792,32,0 +3,1792,32,1 +3,1792,32,2 +3,500,32,0 +3,500,32,1 +3,500,32,2 +3,8000,32,0 +3,8000,32,1 +3,8000,32,2 +3,100000,32,0 
+3,100000,32,1 +3,100000,32,2 +3,702713,32,0 +3,702713,32,1 +3,702713,32,2 +4,7,32,0 +4,7,32,1 +4,7,32,2 +4,192,32,0 +4,192,32,1 +4,192,32,2 +4,1792,32,0 +4,1792,32,1 +4,1792,32,2 +4,500,32,0 +4,500,32,1 +4,500,32,2 +4,8000,32,0 +4,8000,32,1 +4,8000,32,2 +4,100000,32,0 +4,100000,32,1 +4,100000,32,2 +4,702713,32,0 +4,702713,32,1 +4,702713,32,2 +5,7,32,0 +5,7,32,1 +5,7,32,2 +5,192,32,0 +5,192,32,1 +5,192,32,2 +5,1792,32,0 +5,1792,32,1 +5,1792,32,2 +5,500,32,0 +5,500,32,1 +5,500,32,2 +5,8000,32,0 +5,8000,32,1 +5,8000,32,2 +5,100000,32,0 +5,100000,32,1 +5,100000,32,2 +5,702713,32,0 +5,702713,32,1 +5,702713,32,2 +6,7,32,0 +6,7,32,1 +6,7,32,2 +6,192,32,0 +6,192,32,1 +6,192,32,2 +6,1792,32,0 +6,1792,32,1 +6,1792,32,2 +6,500,32,0 +6,500,32,1 +6,500,32,2 +6,8000,32,0 +6,8000,32,1 +6,8000,32,2 +6,100000,32,0 +6,100000,32,1 +6,100000,32,2 +6,702713,32,0 +6,702713,32,1 +6,702713,32,2 +7,7,32,0 +7,7,32,1 +7,7,32,2 +7,192,32,0 +7,192,32,1 +7,192,32,2 +7,1792,32,0 +7,1792,32,1 +7,1792,32,2 +7,500,32,0 +7,500,32,1 +7,500,32,2 +7,8000,32,0 +7,8000,32,1 +7,8000,32,2 +7,100000,32,0 +7,100000,32,1 +7,100000,32,2 +7,702713,32,0 +7,702713,32,1 +7,702713,32,2 +8,7,32,0 +8,7,32,1 +8,7,32,2 +8,192,32,0 +8,192,32,1 +8,192,32,2 +8,1792,32,0 +8,1792,32,1 +8,1792,32,2 +8,500,32,0 +8,500,32,1 +8,500,32,2 +8,8000,32,0 +8,8000,32,1 +8,8000,32,2 +8,100000,32,0 +8,100000,32,1 +8,100000,32,2 +8,702713,32,0 +8,702713,32,1 +8,702713,32,2 +11,7,32,0 +11,7,32,1 +11,7,32,2 +11,192,32,0 +11,192,32,1 +11,192,32,2 +11,1792,32,0 +11,1792,32,1 +11,1792,32,2 +11,500,32,0 +11,500,32,1 +11,500,32,2 +11,8000,32,0 +11,8000,32,1 +11,8000,32,2 +11,100000,32,0 +11,100000,32,1 +11,100000,32,2 +11,702713,32,0 +11,702713,32,1 +11,702713,32,2 +16,7,32,0 +16,7,32,1 +16,7,32,2 +16,192,32,0 +16,192,32,1 +16,192,32,2 +16,1792,32,0 +16,1792,32,1 +16,1792,32,2 +16,500,32,0 +16,500,32,1 +16,500,32,2 +16,8000,32,0 +16,8000,32,1 +16,8000,32,2 +16,100000,32,0 +16,100000,32,1 +16,100000,32,2 +16,702713,32,0 +16,702713,32,1 
+16,702713,32,2 +22,7,32,0 +22,7,32,1 +22,7,32,2 +22,192,32,0 +22,192,32,1 +22,192,32,2 +22,1792,32,0 +22,1792,32,1 +22,1792,32,2 +22,500,32,0 +22,500,32,1 +22,500,32,2 +22,8000,32,0 +22,8000,32,1 +22,8000,32,2 +22,100000,32,0 +22,100000,32,1 +22,100000,32,2 +22,702713,32,0 +22,702713,32,1 +22,702713,32,2 +30,7,32,0 +30,7,32,1 +30,7,32,2 +30,192,32,0 +30,192,32,1 +30,192,32,2 +30,1792,32,0 +30,1792,32,1 +30,1792,32,2 +30,500,32,0 +30,500,32,1 +30,500,32,2 +30,8000,32,0 +30,8000,32,1 +30,8000,32,2 +30,100000,32,0 +30,100000,32,1 +30,100000,32,2 +30,702713,32,0 +30,702713,32,1 +30,702713,32,2 +32,7,32,0 +32,7,32,1 +32,7,32,2 +32,192,32,0 +32,192,32,1 +32,192,32,2 +32,1792,32,0 +32,1792,32,1 +32,1792,32,2 +32,500,32,0 +32,500,32,1 +32,500,32,2 +32,8000,32,0 +32,8000,32,1 +32,8000,32,2 +32,100000,32,0 +32,100000,32,1 +32,100000,32,2 +32,702713,32,0 +32,702713,32,1 +32,702713,32,2 +42,7,32,0 +42,7,32,1 +42,7,32,2 +42,192,32,0 +42,192,32,1 +42,192,32,2 +42,1792,32,0 +42,1792,32,1 +42,1792,32,2 +42,500,32,0 +42,500,32,1 +42,500,32,2 +42,8000,32,0 +42,8000,32,1 +42,8000,32,2 +42,100000,32,0 +42,100000,32,1 +42,100000,32,2 +42,702713,32,0 +42,702713,32,1 +42,702713,32,2 +48,7,32,0 +48,7,32,1 +48,7,32,2 +48,192,32,0 +48,192,32,1 +48,192,32,2 +48,1792,32,0 +48,1792,32,1 +48,1792,32,2 +48,500,32,0 +48,500,32,1 +48,500,32,2 +48,8000,32,0 +48,8000,32,1 +48,8000,32,2 +48,100000,32,0 +48,100000,32,1 +48,100000,32,2 +48,702713,32,0 +48,702713,32,1 +48,702713,32,2 +52,7,32,0 +52,7,32,1 +52,7,32,2 +52,192,32,0 +52,192,32,1 +52,192,32,2 +52,1792,32,0 +52,1792,32,1 +52,1792,32,2 +52,500,32,0 +52,500,32,1 +52,500,32,2 +52,8000,32,0 +52,8000,32,1 +52,8000,32,2 +52,100000,32,0 +52,100000,32,1 +52,100000,32,2 +52,702713,32,0 +52,702713,32,1 +52,702713,32,2 +53,7,32,0 +53,7,32,1 +53,7,32,2 +53,192,32,0 +53,192,32,1 +53,192,32,2 +53,1792,32,0 +53,1792,32,1 +53,1792,32,2 +53,500,32,0 +53,500,32,1 +53,500,32,2 +53,8000,32,0 +53,8000,32,1 +53,8000,32,2 +53,100000,32,0 +53,100000,32,1 
+53,100000,32,2 +53,702713,32,0 +53,702713,32,1 +53,702713,32,2 +64,7,32,0 +64,7,32,1 +64,7,32,2 +64,192,32,0 +64,192,32,1 +64,192,32,2 +64,1792,32,0 +64,1792,32,1 +64,1792,32,2 +64,500,32,0 +64,500,32,1 +64,500,32,2 +64,8000,32,0 +64,8000,32,1 +64,8000,32,2 +64,100000,32,0 +64,100000,32,1 +64,100000,32,2 +64,702713,32,0 +64,702713,32,1 +64,702713,32,2 +80,7,32,0 +80,7,32,1 +80,7,32,2 +80,192,32,0 +80,192,32,1 +80,192,32,2 +80,1792,32,0 +80,1792,32,1 +80,1792,32,2 +80,500,32,0 +80,500,32,1 +80,500,32,2 +80,8000,32,0 +80,8000,32,1 +80,8000,32,2 +80,100000,32,0 +80,100000,32,1 +80,100000,32,2 +80,702713,32,0 +80,702713,32,1 +80,702713,32,2 \ No newline at end of file diff --git a/cpuinfo.py b/cpuinfo.py new file mode 100644 index 000000000..d0bcfa986 --- /dev/null +++ b/cpuinfo.py @@ -0,0 +1,1565 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +# Copyright (c) 2014-2016, Matthew Brennan Jones +# Py-cpuinfo is a Python module to show the cpuinfo of a processor +# It uses a MIT style license +# It is hosted at: https://github.com/workhorsy/py-cpuinfo +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +import os, sys +import re +import time +import platform +import multiprocessing +import ctypes +import pickle +import base64 +import subprocess + +try: + import _winreg as winreg +except ImportError as err: + try: + import winreg + except ImportError as err: + pass + +PY2 = sys.version_info[0] == 2 + + +class DataSource(object): + bits = platform.architecture()[0] + cpu_count = multiprocessing.cpu_count() + is_windows = platform.system().lower() == 'windows' + raw_arch_string = platform.machine() + + @staticmethod + def has_proc_cpuinfo(): + return os.path.exists('/proc/cpuinfo') + + @staticmethod + def has_dmesg(): + return len(program_paths('dmesg')) > 0 + + @staticmethod + def has_cpufreq_info(): + return len(program_paths('cpufreq-info')) > 0 + + @staticmethod + def has_sestatus(): + return len(program_paths('sestatus')) > 0 + + @staticmethod + def has_sysctl(): + return len(program_paths('sysctl')) > 0 + + @staticmethod + def has_isainfo(): + return len(program_paths('isainfo')) > 0 + + @staticmethod + def has_kstat(): + return len(program_paths('kstat')) > 0 + + @staticmethod + def has_sysinfo(): + return len(program_paths('sysinfo')) > 0 + + @staticmethod + def has_lscpu(): + return len(program_paths('lscpu')) > 0 + + @staticmethod + def cat_proc_cpuinfo(): + return run_and_get_stdout(['cat', '/proc/cpuinfo']) + + @staticmethod + def cpufreq_info(): + return run_and_get_stdout(['cpufreq-info']) + + @staticmethod + def sestatus_allow_execheap(): + return run_and_get_stdout(['sestatus', '-b'], ['grep', '-i', '"allow_execheap"'])[1].strip().lower().endswith('on') + + @staticmethod + def sestatus_allow_execmem(): + return run_and_get_stdout(['sestatus', '-b'], ['grep', '-i', 
'"allow_execmem"'])[1].strip().lower().endswith('on') + + @staticmethod + def dmesg_a(): + return run_and_get_stdout(['dmesg', '-a']) + + @staticmethod + def sysctl_machdep_cpu_hw_cpufrequency(): + return run_and_get_stdout(['sysctl', 'machdep.cpu', 'hw.cpufrequency']) + + @staticmethod + def isainfo_vb(): + return run_and_get_stdout(['isainfo', '-vb']) + + @staticmethod + def kstat_m_cpu_info(): + return run_and_get_stdout(['kstat', '-m', 'cpu_info']) + + @staticmethod + def sysinfo_cpu(): + return run_and_get_stdout(['sysinfo', '-cpu']) + + @staticmethod + def lscpu(): + return run_and_get_stdout(['lscpu']) + + @staticmethod + def winreg_processor_brand(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + processor_brand = winreg.QueryValueEx(key, "ProcessorNameString")[0] + winreg.CloseKey(key) + return processor_brand + + @staticmethod + def winreg_vendor_id(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + vendor_id = winreg.QueryValueEx(key, "VendorIdentifier")[0] + winreg.CloseKey(key) + return vendor_id + + @staticmethod + def winreg_raw_arch_string(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"SYSTEM\CurrentControlSet\Control\Session Manager\Environment") + raw_arch_string = winreg.QueryValueEx(key, "PROCESSOR_ARCHITECTURE")[0] + winreg.CloseKey(key) + return raw_arch_string + + @staticmethod + def winreg_hz_actual(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + hz_actual = winreg.QueryValueEx(key, "~Mhz")[0] + winreg.CloseKey(key) + hz_actual = to_hz_string(hz_actual) + return hz_actual + + @staticmethod + def winreg_feature_bits(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + feature_bits = winreg.QueryValueEx(key, "FeatureSet")[0] + winreg.CloseKey(key) + return feature_bits + +def obj_to_b64(thing): + a = thing + b = 
pickle.dumps(a) + c = base64.b64encode(b) + d = c.decode('utf8') + return d + +def b64_to_obj(thing): + a = base64.b64decode(thing) + b = pickle.loads(a) + return b + +def run_and_get_stdout(command, pipe_command=None): + if not pipe_command: + p1 = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = p1.communicate()[0] + if not PY2: + output = output.decode(encoding='UTF-8') + return p1.returncode, output + else: + p1 = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p2 = subprocess.Popen(pipe_command, stdin=p1.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p1.stdout.close() + output = p2.communicate()[0] + if not PY2: + output = output.decode(encoding='UTF-8') + return p2.returncode, output + + +def program_paths(program_name): + paths = [] + exts = filter(None, os.environ.get('PATHEXT', '').split(os.pathsep)) + path = os.environ['PATH'] + for p in os.environ['PATH'].split(os.pathsep): + p = os.path.join(p, program_name) + if os.access(p, os.X_OK): + paths.append(p) + for e in exts: + pext = p + e + if os.access(pext, os.X_OK): + paths.append(pext) + return paths + +def _get_field_actual(cant_be_number, raw_string, field_names): + for line in raw_string.splitlines(): + for field_name in field_names: + field_name = field_name.lower() + if ':' in line: + left, right = line.split(':', 1) + left = left.strip().lower() + right = right.strip() + if left == field_name and len(right) > 0: + if cant_be_number: + if not right.isdigit(): + return right + else: + return right + + return None + +def _get_field(cant_be_number, raw_string, convert_to, default_value, *field_names): + retval = _get_field_actual(cant_be_number, raw_string, field_names) + + # Convert the return value + if retval and convert_to: + try: + retval = convert_to(retval) + except: + retval = default_value + + # Return the default if there is no return value + if retval is None: + retval = default_value + + return retval + +def 
_get_hz_string_from_brand(processor_brand): + # Just return 0 if the processor brand does not have the Hz + if not 'hz' in processor_brand.lower(): + return (1, '0.0') + + hz_brand = processor_brand.lower() + scale = 1 + + if hz_brand.endswith('mhz'): + scale = 6 + elif hz_brand.endswith('ghz'): + scale = 9 + if '@' in hz_brand: + hz_brand = hz_brand.split('@')[1] + else: + hz_brand = hz_brand.rsplit(None, 1)[1] + + hz_brand = hz_brand.rstrip('mhz').rstrip('ghz').strip() + hz_brand = to_hz_string(hz_brand) + + return (scale, hz_brand) + +def _get_hz_string_from_beagle_bone(): + scale, hz_brand = 1, '0.0' + + if not DataSource.has_cpufreq_info(): + return scale, hz_brand + + returncode, output = DataSource.cpufreq_info() + if returncode != 0: + return (scale, hz_brand) + + hz_brand = output.split('current CPU frequency is')[1].split('.')[0].lower() + + if hz_brand.endswith('mhz'): + scale = 6 + elif hz_brand.endswith('ghz'): + scale = 9 + hz_brand = hz_brand.rstrip('mhz').rstrip('ghz').strip() + hz_brand = to_hz_string(hz_brand) + + return (scale, hz_brand) + +def _get_hz_string_from_lscpu(): + scale, hz_brand = 1, '0.0' + + if not DataSource.has_lscpu(): + return scale, hz_brand + + returncode, output = DataSource.lscpu() + if returncode != 0: + return (scale, hz_brand) + + new_hz = _get_field(False, output, None, None, 'CPU max MHz', 'CPU MHz') + if new_hz == None: + return (scale, hz_brand) + + new_hz = to_hz_string(new_hz) + scale = 6 + + return (scale, new_hz) + +def to_friendly_hz(ticks, scale): + # Get the raw Hz as a string + left, right = to_raw_hz(ticks, scale) + ticks = '{0}.{1}'.format(left, right) + + # Get the location of the dot, and remove said dot + dot_index = ticks.index('.') + ticks = ticks.replace('.', '') + + # Get the Hz symbol and scale + symbol = "Hz" + scale = 0 + if dot_index > 9: + symbol = "GHz" + scale = 9 + elif dot_index > 6: + symbol = "MHz" + scale = 6 + elif dot_index > 3: + symbol = "KHz" + scale = 3 + + # Get the Hz with the dot 
at the new scaled point + ticks = '{0}.{1}'.format(ticks[:-scale-1], ticks[-scale-1:]) + + # Format the ticks to have 4 numbers after the decimal + # and remove any superfluous zeroes. + ticks = '{0:.4f} {1}'.format(float(ticks), symbol) + ticks = ticks.rstrip('0') + + return ticks + +def to_raw_hz(ticks, scale): + # Scale the numbers + ticks = ticks.lstrip('0') + old_index = ticks.index('.') + ticks = ticks.replace('.', '') + ticks = ticks.ljust(scale + old_index+1, '0') + new_index = old_index + scale + ticks = '{0}.{1}'.format(ticks[:new_index], ticks[new_index:]) + left, right = ticks.split('.') + left, right = int(left), int(right) + return (left, right) + +def to_hz_string(ticks): + # Convert to string + ticks = '{0}'.format(ticks) + + # Add decimal if missing + if '.' not in ticks: + ticks = '{0}.0'.format(ticks) + + # Remove trailing zeros + ticks = ticks.rstrip('0') + + # Add one trailing zero for empty right side + if ticks.endswith('.'): + ticks = '{0}0'.format(ticks) + + return ticks + +def parse_arch(raw_arch_string): + arch, bits = None, None + raw_arch_string = raw_arch_string.lower() + + # X86 + if re.match('^i\d86$|^x86$|^x86_32$|^i86pc$|^ia32$|^ia-32$|^bepc$', raw_arch_string): + arch = 'X86_32' + bits = 32 + elif re.match('^x64$|^x86_64$|^x86_64t$|^i686-64$|^amd64$|^ia64$|^ia-64$', raw_arch_string): + arch = 'X86_64' + bits = 64 + # ARM + elif re.match('^armv8-a$', raw_arch_string): + arch = 'ARM_8' + bits = 64 + elif re.match('^armv7$|^armv7[a-z]$|^armv7-[a-z]$|^armv6[a-z]$', raw_arch_string): + arch = 'ARM_7' + bits = 32 + elif re.match('^armv8$|^armv8[a-z]$|^armv8-[a-z]$', raw_arch_string): + arch = 'ARM_8' + bits = 32 + # PPC + elif re.match('^ppc32$|^prep$|^pmac$|^powermac$', raw_arch_string): + arch = 'PPC_32' + bits = 32 + elif re.match('^powerpc$|^ppc64$', raw_arch_string): + arch = 'PPC_64' + bits = 64 + # SPARC + elif re.match('^sparc32$|^sparc$', raw_arch_string): + arch = 'SPARC_32' + bits = 32 + elif 
re.match('^sparc64$|^sun4u$|^sun4v$', raw_arch_string): + arch = 'SPARC_64' + bits = 64 + + return (arch, bits) + +def is_bit_set(reg, bit): + mask = 1 << bit + is_set = reg & mask > 0 + return is_set + + +class CPUID(object): + def __init__(self): + # Figure out if SE Linux is on and in enforcing mode + self.is_selinux_enforcing = False + + # Just return if the SE Linux Status Tool is not installed + if not DataSource.has_sestatus(): + return + + # Figure out if we can execute heap and execute memory + can_selinux_exec_heap = DataSource.sestatus_allow_execheap() + can_selinux_exec_memory = DataSource.sestatus_allow_execmem() + self.is_selinux_enforcing = (not can_selinux_exec_heap or not can_selinux_exec_memory) + + def _asm_func(self, restype=None, argtypes=(), byte_code=[]): + byte_code = bytes.join(b'', byte_code) + address = None + + if DataSource.is_windows: + # Allocate a memory segment the size of the byte code, and make it executable + size = len(byte_code) + MEM_COMMIT = ctypes.c_ulong(0x1000) + PAGE_EXECUTE_READWRITE = ctypes.c_ulong(0x40) + address = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_size_t(size), MEM_COMMIT, PAGE_EXECUTE_READWRITE) + if not address: + raise Exception("Failed to VirtualAlloc") + + # Copy the byte code into the memory segment + memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t)(ctypes._memmove_addr) + if memmove(address, byte_code, size) < 0: + raise Exception("Failed to memmove") + else: + # Allocate a memory segment the size of the byte code + size = len(byte_code) + address = ctypes.pythonapi.valloc(size) + if not address: + raise Exception("Failed to valloc") + + # Mark the memory segment as writeable only + if not self.is_selinux_enforcing: + WRITE = 0x2 + if ctypes.pythonapi.mprotect(address, size, WRITE) < 0: + raise Exception("Failed to mprotect") + + # Copy the byte code into the memory segment + if ctypes.pythonapi.memmove(address, byte_code, size) < 0: + 
raise Exception("Failed to memmove") + + # Mark the memory segment as writeable and executable only + if not self.is_selinux_enforcing: + WRITE_EXECUTE = 0x2 | 0x4 + if ctypes.pythonapi.mprotect(address, size, WRITE_EXECUTE) < 0: + raise Exception("Failed to mprotect") + + # Cast the memory segment into a function + functype = ctypes.CFUNCTYPE(restype, *argtypes) + fun = functype(address) + return fun, address + + def _run_asm(self, *byte_code): + # Convert the byte code into a function that returns an int + restype = None + if DataSource.bits == '64bit': + restype = ctypes.c_uint64 + else: + restype = ctypes.c_uint32 + argtypes = () + func, address = self._asm_func(restype, argtypes, byte_code) + + # Call the byte code like a function + retval = func() + + size = ctypes.c_size_t(len(byte_code)) + + # Free the function memory segment + if DataSource.is_windows: + MEM_RELEASE = ctypes.c_ulong(0x8000) + ctypes.windll.kernel32.VirtualFree(address, size, MEM_RELEASE) + else: + # Remove the executable tag on the memory + READ_WRITE = 0x1 | 0x2 + if ctypes.pythonapi.mprotect(address, size, READ_WRITE) < 0: + raise Exception("Failed to mprotect") + + ctypes.pythonapi.free(address) + + return retval + + # FIXME: We should not have to use different instructions to + # set eax to 0 or 1, on 32bit and 64bit machines. 
+ def _zero_eax(self): + if DataSource.bits == '64bit': + return ( + b"\x66\xB8\x00\x00" # mov eax,0x0" + ) + else: + return ( + b"\x31\xC0" # xor ax,ax + ) + + def _one_eax(self): + if DataSource.bits == '64bit': + return ( + b"\x66\xB8\x01\x00" # mov eax,0x1" + ) + else: + return ( + b"\x31\xC0" # xor ax,ax + b"\x40" # inc ax + ) + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID + def get_vendor_id(self): + # EBX + ebx = self._run_asm( + self._zero_eax(), + b"\x0F\xA2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + self._zero_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # EDX + edx = self._run_asm( + self._zero_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xD0" # mov ax,dx + b"\xC3" # ret + ) + + # Each 4bits is a ascii letter in the name + vendor_id = [] + for reg in [ebx, edx, ecx]: + for n in [0, 8, 16, 24]: + vendor_id.append(chr((reg >> n) & 0xFF)) + vendor_id = ''.join(vendor_id) + + return vendor_id + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits + def get_info(self): + # EAX + eax = self._run_asm( + self._one_eax(), + b"\x0f\xa2" # cpuid + b"\xC3" # ret + ) + + # Get the CPU info + stepping = (eax >> 0) & 0xF # 4 bits + model = (eax >> 4) & 0xF # 4 bits + family = (eax >> 8) & 0xF # 4 bits + processor_type = (eax >> 12) & 0x3 # 2 bits + extended_model = (eax >> 16) & 0xF # 4 bits + extended_family = (eax >> 20) & 0xFF # 8 bits + + return { + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : processor_type, + 'extended_model' : extended_model, + 'extended_family' : extended_family + } + + # https://en.wikipedia.org/wiki/CPUID#EAX.3D80000000h:_Get_Highest_Extended_Function_Supported + def get_max_extension_support(self): + # Check for extension support + max_extension_support = self._run_asm( + b"\xB8\x00\x00\x00\x80" # mov ax,0x80000000 + b"\x0f\xa2" # cpuid + b"\xC3" # ret + ) + + return 
max_extension_support + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits + def get_flags(self, max_extension_support): + # EDX + edx = self._run_asm( + self._one_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xD0" # mov ax,dx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + self._one_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # Get the CPU flags + flags = { + 'fpu' : is_bit_set(edx, 0), + 'vme' : is_bit_set(edx, 1), + 'de' : is_bit_set(edx, 2), + 'pse' : is_bit_set(edx, 3), + 'tsc' : is_bit_set(edx, 4), + 'msr' : is_bit_set(edx, 5), + 'pae' : is_bit_set(edx, 6), + 'mce' : is_bit_set(edx, 7), + 'cx8' : is_bit_set(edx, 8), + 'apic' : is_bit_set(edx, 9), + #'reserved1' : is_bit_set(edx, 10), + 'sep' : is_bit_set(edx, 11), + 'mtrr' : is_bit_set(edx, 12), + 'pge' : is_bit_set(edx, 13), + 'mca' : is_bit_set(edx, 14), + 'cmov' : is_bit_set(edx, 15), + 'pat' : is_bit_set(edx, 16), + 'pse36' : is_bit_set(edx, 17), + 'pn' : is_bit_set(edx, 18), + 'clflush' : is_bit_set(edx, 19), + #'reserved2' : is_bit_set(edx, 20), + 'dts' : is_bit_set(edx, 21), + 'acpi' : is_bit_set(edx, 22), + 'mmx' : is_bit_set(edx, 23), + 'fxsr' : is_bit_set(edx, 24), + 'sse' : is_bit_set(edx, 25), + 'sse2' : is_bit_set(edx, 26), + 'ss' : is_bit_set(edx, 27), + 'ht' : is_bit_set(edx, 28), + 'tm' : is_bit_set(edx, 29), + 'ia64' : is_bit_set(edx, 30), + 'pbe' : is_bit_set(edx, 31), + + 'pni' : is_bit_set(ecx, 0), + 'pclmulqdq' : is_bit_set(ecx, 1), + 'dtes64' : is_bit_set(ecx, 2), + 'monitor' : is_bit_set(ecx, 3), + 'ds_cpl' : is_bit_set(ecx, 4), + 'vmx' : is_bit_set(ecx, 5), + 'smx' : is_bit_set(ecx, 6), + 'est' : is_bit_set(ecx, 7), + 'tm2' : is_bit_set(ecx, 8), + 'ssse3' : is_bit_set(ecx, 9), + 'cid' : is_bit_set(ecx, 10), + #'reserved3' : is_bit_set(ecx, 11), + 'fma' : is_bit_set(ecx, 12), + 'cx16' : is_bit_set(ecx, 13), + 'xtpr' : is_bit_set(ecx, 14), + 'pdcm' : is_bit_set(ecx, 15), + #'reserved4' : is_bit_set(ecx, 16), + 'pcid' : 
is_bit_set(ecx, 17), + 'dca' : is_bit_set(ecx, 18), + 'sse4_1' : is_bit_set(ecx, 19), + 'sse4_2' : is_bit_set(ecx, 20), + 'x2apic' : is_bit_set(ecx, 21), + 'movbe' : is_bit_set(ecx, 22), + 'popcnt' : is_bit_set(ecx, 23), + 'tscdeadline' : is_bit_set(ecx, 24), + 'aes' : is_bit_set(ecx, 25), + 'xsave' : is_bit_set(ecx, 26), + 'osxsave' : is_bit_set(ecx, 27), + 'avx' : is_bit_set(ecx, 28), + 'f16c' : is_bit_set(ecx, 29), + 'rdrnd' : is_bit_set(ecx, 30), + 'hypervisor' : is_bit_set(ecx, 31) + } + + # Get a list of only the flags that are true + flags = [k for k, v in flags.items() if v] + + # Get the Extended CPU flags + extended_flags = {} + + # https://en.wikipedia.org/wiki/CPUID#EAX.3D7.2C_ECX.3D0:_Extended_Features + if max_extension_support == 7: + pass + # FIXME: Are we missing all these flags too? + # avx2 et cetera ... + + # https://en.wikipedia.org/wiki/CPUID#EAX.3D80000001h:_Extended_Processor_Info_and_Feature_Bits + if max_extension_support >= 0x80000001: + # EBX # FIXME: This may need to be EDX instead + ebx = self._run_asm( + b"\xB8\x01\x00\x00\x80" # mov ax,0x80000001 + b"\x0f\xa2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + b"\xB8\x01\x00\x00\x80" # mov ax,0x80000001 + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # Get the extended CPU flags + extended_flags = { + 'fpu' : is_bit_set(ebx, 0), + 'vme' : is_bit_set(ebx, 1), + 'de' : is_bit_set(ebx, 2), + 'pse' : is_bit_set(ebx, 3), + 'tsc' : is_bit_set(ebx, 4), + 'msr' : is_bit_set(ebx, 5), + 'pae' : is_bit_set(ebx, 6), + 'mce' : is_bit_set(ebx, 7), + 'cx8' : is_bit_set(ebx, 8), + 'apic' : is_bit_set(ebx, 9), + #'reserved' : is_bit_set(ebx, 10), + 'syscall' : is_bit_set(ebx, 11), + 'mtrr' : is_bit_set(ebx, 12), + 'pge' : is_bit_set(ebx, 13), + 'mca' : is_bit_set(ebx, 14), + 'cmov' : is_bit_set(ebx, 15), + 'pat' : is_bit_set(ebx, 16), + 'pse36' : is_bit_set(ebx, 17), + #'reserved' : is_bit_set(ebx, 18), + 'mp' : is_bit_set(ebx, 19), + 'nx' 
: is_bit_set(ebx, 20), + #'reserved' : is_bit_set(ebx, 21), + 'mmxext' : is_bit_set(ebx, 22), + 'mmx' : is_bit_set(ebx, 23), + 'fxsr' : is_bit_set(ebx, 24), + 'fxsr_opt' : is_bit_set(ebx, 25), + 'pdpe1gp' : is_bit_set(ebx, 26), + 'rdtscp' : is_bit_set(ebx, 27), + #'reserved' : is_bit_set(ebx, 28), + 'lm' : is_bit_set(ebx, 29), + '3dnowext' : is_bit_set(ebx, 30), + '3dnow' : is_bit_set(ebx, 31), + + 'lahf_lm' : is_bit_set(ecx, 0), + 'cmp_legacy' : is_bit_set(ecx, 1), + 'svm' : is_bit_set(ecx, 2), + 'extapic' : is_bit_set(ecx, 3), + 'cr8_legacy' : is_bit_set(ecx, 4), + 'abm' : is_bit_set(ecx, 5), + 'sse4a' : is_bit_set(ecx, 6), + 'misalignsse' : is_bit_set(ecx, 7), + '3dnowprefetch' : is_bit_set(ecx, 8), + 'osvw' : is_bit_set(ecx, 9), + 'ibs' : is_bit_set(ecx, 10), + 'xop' : is_bit_set(ecx, 11), + 'skinit' : is_bit_set(ecx, 12), + 'wdt' : is_bit_set(ecx, 13), + #'reserved' : is_bit_set(ecx, 14), + 'lwp' : is_bit_set(ecx, 15), + 'fma4' : is_bit_set(ecx, 16), + 'tce' : is_bit_set(ecx, 17), + #'reserved' : is_bit_set(ecx, 18), + 'nodeid_msr' : is_bit_set(ecx, 19), + #'reserved' : is_bit_set(ecx, 20), + 'tbm' : is_bit_set(ecx, 21), + 'topoext' : is_bit_set(ecx, 22), + 'perfctr_core' : is_bit_set(ecx, 23), + 'perfctr_nb' : is_bit_set(ecx, 24), + #'reserved' : is_bit_set(ecx, 25), + 'dbx' : is_bit_set(ecx, 26), + 'perftsc' : is_bit_set(ecx, 27), + 'pci_l2i' : is_bit_set(ecx, 28), + #'reserved' : is_bit_set(ecx, 29), + #'reserved' : is_bit_set(ecx, 30), + #'reserved' : is_bit_set(ecx, 31) + } + + # Get a list of only the flags that are true + extended_flags = [k for k, v in extended_flags.items() if v] + flags += extended_flags + + flags.sort() + return flags + + # https://en.wikipedia.org/wiki/CPUID#EAX.3D80000002h.2C80000003h.2C80000004h:_Processor_Brand_String + def get_processor_brand(self, max_extension_support): + processor_brand = "" + + # Processor brand string + if max_extension_support >= 0x80000004: + instructions = [ + b"\xB8\x02\x00\x00\x80", # mov 
ax,0x80000002 + b"\xB8\x03\x00\x00\x80", # mov ax,0x80000003 + b"\xB8\x04\x00\x00\x80" # mov ax,0x80000004 + ] + for instruction in instructions: + # EAX + eax = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xC0" # mov ax,ax + b"\xC3" # ret + ) + + # EBX + ebx = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # EDX + edx = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xD0" # mov ax,dx + b"\xC3" # ret + ) + + # Combine each of the 4 bytes in each register into the string + for reg in [eax, ebx, ecx, edx]: + for n in [0, 8, 16, 24]: + processor_brand += chr((reg >> n) & 0xFF) + + # Strip off any trailing NULL terminators and white space + processor_brand = processor_brand.strip("\0").strip() + + return processor_brand + + # https://en.wikipedia.org/wiki/CPUID#EAX.3D80000006h:_Extended_L2_Cache_Features + def get_cache(self, max_extension_support): + cache_info = {} + + # Just return if the cache feature is not supported + if max_extension_support < 0x80000006: + return cache_info + + # ECX + ecx = self._run_asm( + b"\xB8\x06\x00\x00\x80" # mov ax,0x80000006 + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + cache_info = { + 'size_kb' : ecx & 0xFF, + 'line_size_b' : (ecx >> 12) & 0xF, + 'associativity' : (ecx >> 16) & 0xFFFF + } + + return cache_info + + def get_ticks(self): + retval = None + + if DataSource.bits == '32bit': + # Works on x86_32 + restype = None + argtypes = (ctypes.POINTER(ctypes.c_uint), ctypes.POINTER(ctypes.c_uint)) + get_ticks_x86_32, address = self._asm_func(restype, argtypes, + [ + b"\x55", # push bp + b"\x89\xE5", # mov bp,sp + b"\x31\xC0", # xor ax,ax + b"\x0F\xA2", # cpuid + b"\x0F\x31", # rdtsc + b"\x8B\x5D\x08", # mov bx,[di+0x8] + 
b"\x8B\x4D\x0C", # mov cx,[di+0xc] + b"\x89\x13", # mov [bp+di],dx + b"\x89\x01", # mov [bx+di],ax + b"\x5D", # pop bp + b"\xC3" # ret + ] + ) + + high = ctypes.c_uint32(0) + low = ctypes.c_uint32(0) + + get_ticks_x86_32(ctypes.byref(high), ctypes.byref(low)) + retval = ((high.value << 32) & 0xFFFFFFFF00000000) | low.value + elif DataSource.bits == '64bit': + # Works on x86_64 + restype = ctypes.c_uint64 + argtypes = () + get_ticks_x86_64, address = self._asm_func(restype, argtypes, + [ + b"\x48", # dec ax + b"\x31\xC0", # xor ax,ax + b"\x0F\xA2", # cpuid + b"\x0F\x31", # rdtsc + b"\x48", # dec ax + b"\xC1\xE2\x20", # shl dx,byte 0x20 + b"\x48", # dec ax + b"\x09\xD0", # or ax,dx + b"\xC3", # ret + ] + ) + retval = get_ticks_x86_64() + + return retval + + def get_raw_hz(self): + start = self.get_ticks() + + time.sleep(1) + + end = self.get_ticks() + + ticks = (end - start) + + return ticks + +def get_cpu_info_from_cpuid(): + ''' + Returns the CPU info gathered by querying the X86 cpuid register in a new process. + Returns None of non X86 cpus. + Returns None if SELinux is in enforcing mode. 
+ ''' + + returncode, output = run_and_get_stdout([sys.executable, "-c", "import cpuinfo; print(cpuinfo.actual_get_cpu_info_from_cpuid())"]) + if returncode != 0: + return None + info = b64_to_obj(output) + return info + +def actual_get_cpu_info_from_cpuid(): + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + # Return none if this is not an X86 CPU + if not arch in ['X86_32', 'X86_64']: + return None + + # Return none if SE Linux is in enforcing mode + cpuid = CPUID() + if cpuid.is_selinux_enforcing: + return None + + # Get the cpu info from the CPUID register + max_extension_support = cpuid.get_max_extension_support() + cache_info = cpuid.get_cache(max_extension_support) + info = cpuid.get_info() + + processor_brand = cpuid.get_processor_brand(max_extension_support) + + # Get the Hz and scale + hz_actual = cpuid.get_raw_hz() + hz_actual = to_hz_string(hz_actual) + + # Get the Hz and scale + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + + info = { + 'vendor_id' : cpuid.get_vendor_id(), + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 6), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 6), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + + 'l2_cache_size' : cache_info['size_kb'], + 'l2_cache_line_size' : cache_info['line_size_b'], + 'l2_cache_associativity' : hex(cache_info['associativity']), + + 'stepping' : info['stepping'], + 'model' : info['model'], + 'family' : info['family'], + 'processor_type' : info['processor_type'], + 'extended_model' : info['extended_model'], + 'extended_family' : info['extended_family'], + 'flags' : cpuid.get_flags(max_extension_support) + } + return obj_to_b64(info) + +def get_cpu_info_from_proc_cpuinfo(): + ''' + Returns the CPU info gathered from 
/proc/cpuinfo. Will return None if + /proc/cpuinfo is not found. + ''' + try: + # Just return None if there is no cpuinfo + if not DataSource.has_proc_cpuinfo(): + return None + + returncode, output = DataSource.cat_proc_cpuinfo() + if returncode != 0: + return None + + # Various fields + vendor_id = _get_field(False, output, None, '', 'vendor_id', 'vendor id', 'vendor') + processor_brand = _get_field(True, output, None, None, 'model name','cpu', 'processor') + cache_size = _get_field(False, output, None, '', 'cache size') + stepping = _get_field(False, output, int, 0, 'stepping') + model = _get_field(False, output, int, 0, 'model') + family = _get_field(False, output, int, 0, 'cpu family') + hardware = _get_field(False, output, None, '', 'Hardware') + # Flags + flags = _get_field(False, output, None, None, 'flags', 'Features').split() + flags.sort() + + # Convert from MHz string to Hz + hz_actual = _get_field(False, output, None, '', 'cpu MHz', 'cpu speed', 'clock') + hz_actual = hz_actual.lower().rstrip('mhz').strip() + hz_actual = to_hz_string(hz_actual) + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + + # Try getting the Hz for a BeagleBone + if hz_advertised == '0.0': + scale, hz_advertised = _get_hz_string_from_beagle_bone() + hz_actual = hz_advertised + + # Try getting the Hz for a lscpu + if hz_advertised == '0.0': + scale, hz_advertised = _get_hz_string_from_lscpu() + hz_actual = hz_advertised + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + return { + 'vendor_id' : vendor_id, + 'hardware' : hardware, + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 6), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 6), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + + 
'l2_cache_size' : cache_size, + 'l2_cache_line_size' : 0, + 'l2_cache_associativity' : 0, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : 0, + 'extended_model' : 0, + 'extended_family' : 0, + 'flags' : flags + } + except: + #raise # NOTE: To have this throw on error, uncomment this line + return None + +def get_cpu_info_from_dmesg(): + ''' + Returns the CPU info gathered from dmesg. Will return None if + dmesg is not found or does not have the desired info. + ''' + try: + # Just return None if there is no dmesg + if not DataSource.has_dmesg(): + return None + + # If dmesg fails return None + returncode, output = DataSource.dmesg_a() + if output == None or returncode != 0: + return None + + # Processor Brand + long_brand = output.split('CPU: ')[1].split('\n')[0] + processor_brand = long_brand.rsplit('(', 1)[0] + processor_brand = processor_brand.strip() + + # Hz + scale = 0 + hz_actual = long_brand.rsplit('(', 1)[1].split(' ')[0].lower() + if hz_actual.endswith('mhz'): + scale = 6 + elif hz_actual.endswith('ghz'): + scale = 9 + hz_actual = hz_actual.split('-')[0] + hz_actual = to_hz_string(hz_actual) + + # Various fields + fields = output.split('CPU: ')[1].split('\n')[1].split('\n')[0].strip().split(' ') + vendor_id = None + stepping = None + model = None + family = None + for field in fields: + name, value = field.split('=') + name = name.strip().lower() + value = value.strip() + if name == 'origin': + vendor_id = value.strip('"') + elif name == 'stepping': + stepping = int(value) + elif name == 'model': + model = int(value, 16) + elif name == 'family': + family = int(value, 16) + + # Flags + flag_lines = [] + for category in [' Features=', ' Features2=', ' AMD Features=', ' AMD Features2=']: + if category in output: + flag_lines.append(output.split(category)[1].split('\n')[0]) + + flags = [] + for line in flag_lines: + line = line.split('<')[1].split('>')[0].lower() + for flag in line.split(','): + flags.append(flag) + 
flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + return { + 'vendor_id' : vendor_id, + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 6), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 6), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + + 'l2_cache_size' : 0, + 'l2_cache_line_size' : 0, + 'l2_cache_associativity' : 0, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : 0, + 'extended_model' : 0, + 'extended_family' : 0, + 'flags' : flags + } + except: + return None + +def get_cpu_info_from_sysctl(): + ''' + Returns the CPU info gathered from sysctl. Will return None if + sysctl is not found. 
+ ''' + try: + # Just return None if there is no sysctl + if not DataSource.has_sysctl(): + return None + + # If sysctl fails return None + returncode, output = DataSource.sysctl_machdep_cpu_hw_cpufrequency() + if output == None or returncode != 0: + return None + + # Various fields + vendor_id = _get_field(False, output, None, None, 'machdep.cpu.vendor') + processor_brand = _get_field(True, output, None, None, 'machdep.cpu.brand_string') + cache_size = _get_field(False, output, None, None, 'machdep.cpu.cache.size') + stepping = _get_field(False, output, int, 0, 'machdep.cpu.stepping') + model = _get_field(False, output, int, 0, 'machdep.cpu.model') + family = _get_field(False, output, int, 0, 'machdep.cpu.family') + + # Flags + flags = _get_field(False, output, None, None, 'machdep.cpu.features').lower().split() + flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + hz_actual = _get_field(False, output, None, None, 'hw.cpufrequency') + hz_actual = to_hz_string(hz_actual) + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + return { + 'vendor_id' : vendor_id, + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 0), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 0), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + + 'l2_cache_size' : cache_size, + 'l2_cache_line_size' : 0, + 'l2_cache_associativity' : 0, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : 0, + 'extended_model' : 0, + 'extended_family' : 0, + 'flags' : flags + } + except: + return None + +def get_cpu_info_from_sysinfo(): + ''' + Returns the CPU info gathered from sysinfo. Will return None if + sysinfo is not found. 
+ ''' + try: + # Just return None if there is no sysinfo + if not DataSource.has_sysinfo(): + return None + + # If sysinfo fails return None + returncode, output = DataSource.sysinfo_cpu() + if output == None or returncode != 0: + return None + + # Various fields + vendor_id = '' #_get_field(False, output, None, None, 'CPU #0: ') + processor_brand = output.split('CPU #0: "')[1].split('"\n')[0] + cache_size = '' #_get_field(False, output, None, None, 'machdep.cpu.cache.size') + stepping = int(output.split(', stepping ')[1].split(',')[0].strip()) + model = int(output.split(', model ')[1].split(',')[0].strip()) + family = int(output.split(', family ')[1].split(',')[0].strip()) + + # Flags + flags = [] + for line in output.split('\n'): + if line.startswith('\t\t'): + for flag in line.strip().lower().split(): + flags.append(flag) + flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + hz_actual = hz_advertised + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + return { + 'vendor_id' : vendor_id, + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, scale), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, scale), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + + 'l2_cache_size' : cache_size, + 'l2_cache_line_size' : 0, + 'l2_cache_associativity' : 0, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : 0, + 'extended_model' : 0, + 'extended_family' : 0, + 'flags' : flags + } + except: + return None + +def get_cpu_info_from_registry(): + ''' + FIXME: Is missing many of the newer CPU flags like sse3 + Returns the CPU info gathered from the Windows Registry. Will return None if + not on Windows. 
+ ''' + try: + # Just return None if not on Windows + if not DataSource.is_windows: + return None + + # Get the CPU name + processor_brand = DataSource.winreg_processor_brand() + + # Get the CPU vendor id + vendor_id = DataSource.winreg_vendor_id() + + # Get the CPU arch and bits + raw_arch_string = DataSource.winreg_raw_arch_string() + arch, bits = parse_arch(raw_arch_string) + + # Get the actual CPU Hz + hz_actual = DataSource.winreg_hz_actual() + hz_actual = to_hz_string(hz_actual) + + # Get the advertised CPU Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + + # Get the CPU features + feature_bits = DataSource.winreg_feature_bits() + + def is_set(bit): + mask = 0x80000000 >> bit + retval = mask & feature_bits > 0 + return retval + + # http://en.wikipedia.org/wiki/CPUID + # http://unix.stackexchange.com/questions/43539/what-do-the-flags-in-proc-cpuinfo-mean + # http://www.lohninger.com/helpcsuite/public_constants_cpuid.htm + flags = { + 'fpu' : is_set(0), # Floating Point Unit + 'vme' : is_set(1), # V86 Mode Extensions + 'de' : is_set(2), # Debug Extensions - I/O breakpoints supported + 'pse' : is_set(3), # Page Size Extensions (4 MB pages supported) + 'tsc' : is_set(4), # Time Stamp Counter and RDTSC instruction are available + 'msr' : is_set(5), # Model Specific Registers + 'pae' : is_set(6), # Physical Address Extensions (36 bit address, 2MB pages) + 'mce' : is_set(7), # Machine Check Exception supported + 'cx8' : is_set(8), # Compare Exchange Eight Byte instruction available + 'apic' : is_set(9), # Local APIC present (multiprocessor operation support) + 'sepamd' : is_set(10), # Fast system calls (AMD only) + 'sep' : is_set(11), # Fast system calls + 'mtrr' : is_set(12), # Memory Type Range Registers + 'pge' : is_set(13), # Page Global Enable + 'mca' : is_set(14), # Machine Check Architecture + 'cmov' : is_set(15), # Conditional MOVe instructions + 'pat' : is_set(16), # Page Attribute Table + 'pse36' : is_set(17), # 36 bit Page Size 
Extensions + 'serial' : is_set(18), # Processor Serial Number + 'clflush' : is_set(19), # Cache Flush + #'reserved1' : is_set(20), # reserved + 'dts' : is_set(21), # Debug Trace Store + 'acpi' : is_set(22), # ACPI support + 'mmx' : is_set(23), # MultiMedia Extensions + 'fxsr' : is_set(24), # FXSAVE and FXRSTOR instructions + 'sse' : is_set(25), # SSE instructions + 'sse2' : is_set(26), # SSE2 (WNI) instructions + 'ss' : is_set(27), # self snoop + #'reserved2' : is_set(28), # reserved + 'tm' : is_set(29), # Automatic clock control + 'ia64' : is_set(30), # IA64 instructions + '3dnow' : is_set(31) # 3DNow! instructions available + } + + # Get a list of only the flags that are true + flags = [k for k, v in flags.items() if v] + flags.sort() + + return { + 'vendor_id' : vendor_id, + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 6), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 6), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : raw_arch_string, + + 'l2_cache_size' : 0, + 'l2_cache_line_size' : 0, + 'l2_cache_associativity' : 0, + + 'stepping' : 0, + 'model' : 0, + 'family' : 0, + 'processor_type' : 0, + 'extended_model' : 0, + 'extended_family' : 0, + 'flags' : flags + } + except: + return None + +def get_cpu_info_from_kstat(): + ''' + Returns the CPU info gathered from isainfo and kstat. Will + return None if isainfo or kstat are not found. 
+ ''' + try: + # Just return None if there is no isainfo or kstat + if not DataSource.has_isainfo() or not DataSource.has_kstat(): + return None + + # If isainfo fails return None + returncode, flag_output = DataSource.isainfo_vb() + if flag_output == None or returncode != 0: + return None + + # If kstat fails return None + returncode, kstat = DataSource.kstat_m_cpu_info() + if kstat == None or returncode != 0: + return None + + # Various fields + vendor_id = kstat.split('\tvendor_id ')[1].split('\n')[0].strip() + processor_brand = kstat.split('\tbrand ')[1].split('\n')[0].strip() + cache_size = 0 + stepping = int(kstat.split('\tstepping ')[1].split('\n')[0].strip()) + model = int(kstat.split('\tmodel ')[1].split('\n')[0].strip()) + family = int(kstat.split('\tfamily ')[1].split('\n')[0].strip()) + + # Flags + flags = flag_output.strip().split('\n')[-1].strip().lower().split() + flags.sort() + + # Convert from GHz/MHz string to Hz + scale = 6 + hz_advertised = kstat.split('\tclock_MHz ')[1].split('\n')[0].strip() + hz_advertised = to_hz_string(hz_advertised) + + # Convert from GHz/MHz string to Hz + hz_actual = kstat.split('\tcurrent_clock_Hz ')[1].split('\n')[0].strip() + hz_actual = to_hz_string(hz_actual) + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + return { + 'vendor_id' : vendor_id, + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 0), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 0), + + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + + 'l2_cache_size' : cache_size, + 'l2_cache_line_size' : 0, + 'l2_cache_associativity' : 0, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : 0, + 'extended_model' : 0, + 'extended_family' : 0, + 'flags' : flags + } + except: + 
return None + +def get_cpu_info(): + info = None + + # Try the Windows registry + if not info: + info = get_cpu_info_from_registry() + + # Try /proc/cpuinfo + if not info: + info = get_cpu_info_from_proc_cpuinfo() + + # Try sysctl + if not info: + info = get_cpu_info_from_sysctl() + + # Try kstat + if not info: + info = get_cpu_info_from_kstat() + + # Try dmesg + if not info: + info = get_cpu_info_from_dmesg() + + # Try sysinfo + if not info: + info = get_cpu_info_from_sysinfo() + + # Try querying the CPU cpuid register + if not info: + info = get_cpu_info_from_cpuid() + + return info + +# Make sure we are running on a supported system +def _check_arch(): + arch, bits = parse_arch(DataSource.raw_arch_string) + if not arch in ['X86_32', 'X86_64', 'ARM_7', 'ARM_8']: + raise Exception("py-cpuinfo currently only works on X86 and some ARM CPUs.") + +def main(): + try: + _check_arch() + except Exception as err: + sys.stderr.write(str(err) + "\n") + sys.exit(1) + + info = get_cpu_info() + if info: + print('Vendor ID: {0}'.format(info.get('vendor_id', ''))) + print('Hardware Raw: {0}'.format(info.get('hardware', ''))) + print('Brand: {0}'.format(info.get('brand', ''))) + print('Hz Advertised: {0}'.format(info.get('hz_advertised', ''))) + print('Hz Actual: {0}'.format(info.get('hz_actual', ''))) + print('Hz Advertised Raw: {0}'.format(info.get('hz_advertised_raw', ''))) + print('Hz Actual Raw: {0}'.format(info.get('hz_actual_raw', ''))) + print('Arch: {0}'.format(info.get('arch', ''))) + print('Bits: {0}'.format(info.get('bits', ''))) + print('Count: {0}'.format(info.get('count', ''))) + + print('Raw Arch String: {0}'.format(info.get('raw_arch_string', ''))) + + print('L2 Cache Size: {0}'.format(info.get('l2_cache_size', ''))) + print('L2 Cache Line Size: {0}'.format(info.get('l2_cache_line_size', ''))) + print('L2 Cache Associativity: {0}'.format(info.get('l2_cache_associativity', ''))) + + print('Stepping: {0}'.format(info.get('stepping', ''))) + print('Model: 
{0}'.format(info.get('model', ''))) + print('Family: {0}'.format(info.get('family', ''))) + print('Processor Type: {0}'.format(info.get('processor_type', ''))) + print('Extended Model: {0}'.format(info.get('extended_model', ''))) + print('Extended Family: {0}'.format(info.get('extended_family', ''))) + print('Flags: {0}'.format(', '.join(info.get('flags', '')))) + else: + sys.stderr.write("Failed to find cpu info\n") + sys.exit(1) + + +if __name__ == '__main__': + main() +else: + _check_arch() diff --git a/hdf5-blosc/.gitignore b/hdf5-blosc/.gitignore new file mode 100644 index 000000000..31d31148b --- /dev/null +++ b/hdf5-blosc/.gitignore @@ -0,0 +1,36 @@ +# Object files +*.o +*.ko +*.obj +*.elf + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ + +# Anything in the 'build' folder. +build/ + diff --git a/hdf5-blosc/.travis.yml b/hdf5-blosc/.travis.yml new file mode 100644 index 000000000..95ad44bca --- /dev/null +++ b/hdf5-blosc/.travis.yml @@ -0,0 +1,22 @@ +language: c + +os: +- linux +- osx + +compiler: + - gcc + - clang + +before_install: ./travis-before-install.sh + +install: sudo apt-get install libhdf5-serial-dev + +before_script: + - mkdir build + - cd build + - cmake .. + +script: + - cmake --build . 
--config Release + - ctest diff --git a/hdf5-blosc/CMakeLists.txt b/hdf5-blosc/CMakeLists.txt new file mode 100644 index 000000000..8644a048a --- /dev/null +++ b/hdf5-blosc/CMakeLists.txt @@ -0,0 +1,71 @@ +cmake_minimum_required(VERSION 2.8.10) +project(blosc_hdf5) +include(ExternalProject) + +# options +option(BUILD_TESTS + "Build test programs form the blosc filter" ON) + +set(BLOSC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/blosc") +set(BLOSC_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/blosc") +set(BLOSC_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BLOSC_INSTALL_DIR}) + +message("BLOSC_PREFIX='${BLOSC_PREFIX}'") +message("BLOSC_INSTALL_DIR='${BLOSC_INSTALL_DIR}'") +message("BLOSC_CMAKE_ARGS='${BLOSC_CMAKE_ARGS}'") +message("GIT_EXECUTABLE='${GIT_EXECUTABLE}'") + +ExternalProject_Add(blosc + PREFIX ${BLOSC_PREFIX} + GIT_REPOSITORY https://github.com/Blosc/c-blosc.git + INSTALL_DIR ${BLOSC_INSTALL_DIR} + CMAKE_ARGS ${BLOSC_CMAKE_ARGS} +) + + +# sources +set(SOURCES src/blosc_filter.c) + +# dependencies +if(MSVC) + # FindHDF5.cmake does not find Windows installations. Try to + # use an environment variable instead until the official "find" + # file can be updated for Windows. + # + # Note that you have to set this environment variable by hand. 
+ file(TO_CMAKE_PATH "$ENV{HDF5_DIR}" HDF5_HINT) + set(HDF5_DIR ${HDF5_HINT} CACHE STRING "Path to HDF5 CMake config directory.") + find_package(HDF5 REQUIRED HINTS ${HDF5_DIR}) +else(MSVC) + find_package(HDF5 REQUIRED) +endif(MSVC) +include_directories(${HDF5_INCLUDE_DIRS}) + + +# add blosc libraries +add_library(blosc_shared SHARED IMPORTED) +set_property(TARGET blosc_shared PROPERTY IMPORTED_LOCATION ${BLOSC_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}blosc${CMAKE_SHARED_LIBRARY_SUFFIX}) +add_dependencies(blosc_shared blosc) +include_directories(${BLOSC_INSTALL_DIR}/include) + +add_library(blosc_filter_shared SHARED ${SOURCES}) +set_target_properties( + blosc_filter_shared PROPERTIES OUTPUT_NAME blosc_filter) +target_link_libraries(blosc_filter_shared blosc_shared ${HDF5_LIBRARIES}) + +# install +install(FILES src/blosc_filter.h DESTINATION include COMPONENT HDF5_FILTER_DEV) +install(TARGETS blosc_filter_shared DESTINATION lib COMPONENT HDF5_FILTER_DEV) + + +# test +message("LINK LIBRARIES='blosc_filter_shared ${HDF5_LIBRARIES}'") +if(BUILD_TESTS) + enable_testing() + set(CMAKE_THREAD_PREFER_PTHREAD TRUE) + find_package(Threads REQUIRED) + set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) + add_executable(example src/example.c) + target_link_libraries(example blosc_filter_shared ${HDF5_LIBRARIES} ${LIBS}) + add_test(test_hdf5_filter example) +endif(BUILD_TESTS) diff --git a/hdf5-blosc/LICENSES/BLOSC.txt b/hdf5-blosc/LICENSES/BLOSC.txt new file mode 100644 index 000000000..55b1392c5 --- /dev/null +++ b/hdf5-blosc/LICENSES/BLOSC.txt @@ -0,0 +1,21 @@ +Blosc - A blocking, shuffling and lossless compression library + +Copyright (C) 2009-2015 Francesc Alted + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
+copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/hdf5-blosc/LICENSES/BLOSC_HDF5.txt b/hdf5-blosc/LICENSES/BLOSC_HDF5.txt new file mode 100644 index 000000000..9c4c14566 --- /dev/null +++ b/hdf5-blosc/LICENSES/BLOSC_HDF5.txt @@ -0,0 +1,21 @@ +Blosc for HDF5 - An HDF5 filter that uses the Blosc compressor. + +Copyright (C) 2009-2015 Francesc Alted + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/c-blosc/LICENSES/H5PY.txt b/hdf5-blosc/LICENSES/H5PY.txt similarity index 100% rename from c-blosc/LICENSES/H5PY.txt rename to hdf5-blosc/LICENSES/H5PY.txt diff --git a/hdf5-blosc/README.rst b/hdf5-blosc/README.rst new file mode 100644 index 000000000..147c99dfe --- /dev/null +++ b/hdf5-blosc/README.rst @@ -0,0 +1,69 @@ +===================== +Blosc filter for HDF5 +===================== + +:Travis CI: |travis| +:And...: |powered| + +.. |travis| image:: https://travis-ci.org/Blosc/hdf5.png?branch=master + :target: https://travis-ci.org/Blosc/hdf5 + +.. |powered| image:: http://b.repl.ca/v1/Powered--By-Blosc-blue.png + :target: https://blosc.org + +This is an example of filter for HDF5 that uses the Blosc compressor. + +You need to be a bit careful before using this filter because you +should not activate the shuffle right in HDF5, but rather from Blosc +itself. This is because Blosc uses an SIMD shuffle internally which +is much faster. + + +Using the Blosc filter from HDF5 +================================ + +In order to register Blosc into your HDF5 application, you only need +to call a function in blosc_filter.h, with the following signature: + + int register_blosc(char **version, char **date) + +Calling this will register the filter with the HDF5 library and will +return info about the Blosc release in `**version` and `**date` +char pointers. + +A non-negative return value indicates success. If the registration +fails, an error is pushed onto the current error stack and a negative +value is returned. + +An example C program ('src/example.c') is included which demonstrates +the proper use of the filter. + +This filter has been tested against HDF5 versions 1.6.5 through +1.8.10. 
It is released under the MIT license (see LICENSE.txt for +details). + + +Compiling +========= + +The filter consists of a single 'src/blosc_filter.c' source file and +'src/blosc_filter.h' header, which will need the Blosc library +installed to work. + + +As an HDF5 plugin +================= + +Also, you can use blosc as an HDF5 plugin; see 'src/blosc_plugin.c' for +details. + + +Acknowledgments +=============== + +See THANKS.rst. + + +---- + + **Enjoy data!** diff --git a/c-blosc/hdf5/blosc_filter.c b/hdf5-blosc/src/blosc_filter.c similarity index 96% rename from c-blosc/hdf5/blosc_filter.c rename to hdf5-blosc/src/blosc_filter.c index a4b696220..3ab523736 100644 --- a/c-blosc/hdf5/blosc_filter.c +++ b/hdf5-blosc/src/blosc_filter.c @@ -105,7 +105,7 @@ herr_t blosc_set_local(hid_t dcpl, hid_t type, hid_t space){ size_t nelements = 8; unsigned int values[] = {0,0,0,0,0,0,0,0}; hid_t super_type; - H5T_class_t class; + H5T_class_t classt; r = GET_FILTER(dcpl, FILTER_BLOSC, &flags, &nelements, values, 0, NULL); if(r<0) return -1; @@ -126,8 +126,8 @@ herr_t blosc_set_local(hid_t dcpl, hid_t type, hid_t space){ typesize = H5Tget_size(type); if (typesize==0) return -1; /* Get the size of the base type, even for ARRAY types */ - class = H5Tget_class(type); - if (class == H5T_ARRAY) { + classt = H5Tget_class(type); + if (classt == H5T_ARRAY) { /* Get the array base component */ super_type = H5Tget_super(type); basetypesize = H5Tget_size(super_type); @@ -175,7 +175,7 @@ size_t blosc_filter(unsigned flags, size_t cd_nelmts, int doshuffle = 1; /* Shuffle default */ int compcode; /* Blosc compressor */ int code; - char *compname = "blosclz"; + char *compname = "blosclz"; /* The compressor by default */ char *complist; char errmsg[256]; @@ -234,10 +234,6 @@ size_t blosc_filter(unsigned flags, size_t cd_nelmts, } #if ( (BLOSC_VERSION_MAJOR <= 1) && (BLOSC_VERSION_MINOR < 5) ) - /* Select the correct compressor to use */ - if (compname != NULL) - blosc_set_compressor(compname); - 
status = blosc_compress(clevel, doshuffle, typesize, nbytes, *buf, outbuf, nbytes); #else @@ -281,7 +277,7 @@ size_t blosc_filter(unsigned flags, size_t cd_nelmts, } #if ( (BLOSC_VERSION_MAJOR <= 1) && (BLOSC_VERSION_MINOR < 5) ) - status = blosc_decompress(*buf, outbuf, outbuf_size); + status = blosc_decompress(*buf, outbuf, outbuf_size); #else /* Starting from Blosc 1.5 on, there is not an internal global lock anymore, so do not try to run in multithreading mode diff --git a/c-blosc/hdf5/blosc_filter.h b/hdf5-blosc/src/blosc_filter.h similarity index 100% rename from c-blosc/hdf5/blosc_filter.h rename to hdf5-blosc/src/blosc_filter.h diff --git a/c-blosc/hdf5/blosc_plugin.c b/hdf5-blosc/src/blosc_plugin.c similarity index 100% rename from c-blosc/hdf5/blosc_plugin.c rename to hdf5-blosc/src/blosc_plugin.c diff --git a/c-blosc/hdf5/blosc_plugin.h b/hdf5-blosc/src/blosc_plugin.h similarity index 100% rename from c-blosc/hdf5/blosc_plugin.h rename to hdf5-blosc/src/blosc_plugin.h diff --git a/c-blosc/hdf5/example.c b/hdf5-blosc/src/example.c similarity index 93% rename from c-blosc/hdf5/example.c rename to hdf5-blosc/src/example.c index 3b386e330..d72b3fc7b 100644 --- a/c-blosc/hdf5/example.c +++ b/hdf5-blosc/src/example.c @@ -1,6 +1,6 @@ /* Copyright (C) 2010 Francesc Alted - http://blosc.pytables.org + http://blosc.org License: MIT (see LICENSE.txt) Example program demonstrating use of the Blosc filter from C code. 
@@ -9,8 +9,7 @@ To compile this program: - h5cc [-DH5_USE_16_API] -lblosc blosc_filter.c example.c \ - -o example -lpthread + h5cc blosc_filter.c example.c -o example -lblosc -lpthread To run: @@ -73,7 +72,7 @@ int main(){ r = H5Pset_chunk(plist, 3, chunkshape); if(r<0) goto failed; - /* Using the blosc filter in combianation with other ones also works */ + /* Using the blosc filter in combination with other ones also works */ /* r = H5Pset_fletcher32(plist); if(r<0) goto failed; @@ -81,7 +80,7 @@ int main(){ /* This is the easiest way to call Blosc with default values: 5 for BloscLZ and shuffle active. */ - /* r = H5Pset_filter(plist, FILTER_BLOSC, H5Z_FLAG_OPTIONAL, 0, NULL); */ + /* r = H5Pset_filter(plist, FILTER_BLOSC, H5Z_FLAG_OPTIONAL, 0, NULL); */ /* But you can also taylor Blosc parameters to your needs */ /* 0 to 3 (inclusive) param slots are reserved. */ diff --git a/hdf5-blosc/travis-before-install.sh b/hdf5-blosc/travis-before-install.sh new file mode 100755 index 000000000..8dacf772f --- /dev/null +++ b/hdf5-blosc/travis-before-install.sh @@ -0,0 +1,16 @@ +#!/bin/sh -f + +# things to do for travis-ci in the before_install section + +if ( test "`uname -s`" = "Darwin" ) +then + #cmake v2.8.12 is installed on the Mac workers now + #brew update + #brew install cmake + echo +else + #install a newer cmake since at this time Travis only has version 2.8.7 + sudo add-apt-repository --yes ppa:kalakris/cmake + sudo apt-get update -qq + sudo apt-get install cmake +fi diff --git a/setup.py b/setup.py index 5a966db9d..6ecbd8151 100755 --- a/setup.py +++ b/setup.py @@ -30,6 +30,10 @@ # This is also what pandas does. 
from setuptools.command.build_ext import build_ext +# For guessing the capabilities of the CPU for C-Blosc +import cpuinfo +cpu_info = cpuinfo.get_cpu_info() + # The name for the pkg-config utility PKG_CONFIG = 'pkg-config' @@ -128,7 +132,7 @@ def print_warning(head, body=''): # Global variables lib_dirs = [] -inc_dirs = ['c-blosc/hdf5'] +inc_dirs = [os.path.join('hdf5-blosc', 'src')] optional_libs = [] data_files = [] # list of data files to add to packages (mainly for DLL's) @@ -483,7 +487,10 @@ def get_hdf5_version(headername): # variable to rebuild pytables if not HDF5_DIR and os.name == 'nt': import ctypes.util - libdir = ctypes.util.find_library('hdf5dll.dll') + if not debug: + libdir = ctypes.util.find_library('hdf5.dll') or ctypes.util.find_library('hdf5dll.dll') + else: + libdir = ctypes.util.find_library('hdf5_D.dll') or ctypes.util.find_library('hdf5ddll.dll') # Like 'C:\\Program Files\\HDF Group\\HDF5\\1.8.8\\bin\\hdf5dll.dll' if libdir: # Strip off the filename @@ -547,6 +554,24 @@ def get_hdf5_version(headername): (hdrdir, libdir, rundir) = package.find_directories( location, use_pkgconfig=USE_PKGCONFIG) + # check if HDF5 library uses old DLL naming scheme + if hdrdir and package.tag == 'HDF5': + hdf5_header = os.path.join(hdrdir, "H5public.h") + hdf5_version = get_hdf5_version(hdf5_header) + if hdf5_version < min_hdf5_version: + exit_with_error( + "Unsupported HDF5 version! HDF5 v%s+ required. " + "Found version v%s" % ( + '.'.join(map(str, min_hdf5_version)), + '.'.join(map(str, hdf5_version)))) + + if os.name == 'nt' and hdf5_version < (1, 8, 10): + hdf5_old_dll_name = 'hdf5dll' if not debug else 'hdf5ddll' + package.library_name = hdf5_old_dll_name + package.runtime_name = hdf5_old_dll_name + _platdep['HDF5'] = [hdf5_old_dll_name, hdf5_old_dll_name] + _, libdir, rundir = package.find_directories(location, use_pkgconfig=USE_PKGCONFIG) +
# check if the library is in the standard compiler paths if not libdir and package.target_function: libdir = compiler.has_function(package.target_function, @@ -729,17 +757,18 @@ def find_name(base='tables'): ADDLIBS = [hdf5_package.library_name] # List of Blosc file dependencies -blosc_files = ["c-blosc/hdf5/blosc_filter.c"] +blosc_sources = ["hdf5-blosc/src/blosc_filter.c"] if 'BLOSC' not in optional_libs: # Compiling everything from sources # Blosc + BloscLZ sources - blosc_files += glob.glob('c-blosc/blosc/*.c') + blosc_sources += [f for f in glob.glob('c-blosc/blosc/*.c') + if 'avx2' not in f and 'sse2' not in f] # LZ4 sources - blosc_files += glob.glob('c-blosc/internal-complibs/lz4*/*.c') + blosc_sources += glob.glob('c-blosc/internal-complibs/lz4*/*.c') # Snappy sources - blosc_files += glob.glob('c-blosc/internal-complibs/snappy*/*.cc') + blosc_sources += glob.glob('c-blosc/internal-complibs/snappy*/*.cc') # Zlib sources - blosc_files += glob.glob('c-blosc/internal-complibs/zlib*/*.c') + blosc_sources += glob.glob('c-blosc/internal-complibs/zlib*/*.c') # Finally, add all the include dirs...
inc_dirs += [os.path.join('c-blosc', 'blosc')] inc_dirs += glob.glob('c-blosc/internal-complibs/*') @@ -761,12 +790,23 @@ def compiler_has_flags(compiler, flags): finally: os.remove(fd.name) - try_flags = ["-march=native", "-msse2"] - for ff in try_flags: - if compiler_has_flags(compiler, [ff]): - print("Setting compiler flag: " + ff) - CFLAGS.append(ff) - break + # Detection code for SSE2/AVX2 only works for gcc/clang, not for MSVC yet + # SSE2 + if ('sse2' in cpu_info['flags'] and + compiler_has_flags(compiler, ["-msse2"])): + print('SSE2 detected') + CFLAGS.append('-DSHUFFLE_SSE2_ENABLED') + CFLAGS.append('-msse2') + blosc_sources += [f for f in glob.glob('c-blosc/blosc/*.c') + if 'sse2' in f] + # AVX2 + if ('avx2' in cpu_info['flags'] and + compiler_has_flags(compiler, ["-mavx2"])): + print('AVX2 detected') + CFLAGS.append('-DSHUFFLE_AVX2_ENABLED') + CFLAGS.append('-mavx2') + blosc_sources += [f for f in glob.glob('c-blosc/blosc/*.c') + if 'avx2' in f] else: ADDLIBS += ['blosc'] @@ -796,7 +836,7 @@ def compiler_has_flags(compiler, flags): "src/utils.c", "src/H5ARRAY.c", "src/H5ATTR.c", - ] + blosc_files, + ] + blosc_sources, library_dirs=lib_dirs, libraries=utilsExtension_libs, extra_link_args=LFLAGS, @@ -812,7 +852,7 @@ def compiler_has_flags(compiler, flags): "src/H5ARRAY-opt.c", "src/H5VLARRAY.c", "src/H5ATTR.c", - ] + blosc_files, + ] + blosc_sources, library_dirs=lib_dirs, libraries=hdf5Extension_libs, extra_link_args=LFLAGS, @@ -826,7 +866,7 @@ def compiler_has_flags(compiler, flags): "src/typeconv.c", "src/H5TB-opt.c", "src/H5ATTR.c", - ] + blosc_files, + ] + blosc_sources, library_dirs=lib_dirs, libraries=tableExtension_libs, extra_link_args=LFLAGS,