diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..c492825
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.5.6
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..ced47c7
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,209 @@
+# The test build matrix (stage: test) is constructed to test a wide range of
+# configurations, rather than a single pass/fail. This helps to catch build
+# failures and logic errors that present on platforms other than the ones the
+# author has tested.
+#
+# Some builders use the dependency-generator in `./depends`, rather than using
+# apt-get to install build dependencies. This guarantees that the tester is
+# using the same versions as Gitian, so the build results are nearly identical
+# to what would be found in a final release.
+#
+# In order to avoid rebuilding all dependencies for each build, the binaries
+# are cached and re-used when possible. Changes in the dependency-generator
+# will trigger cache-invalidation and rebuilds as necessary.
+#
+# These caches can be manually removed if necessary. This is one of the very
+# few manual operations that is possible with Travis, and it can be done by a
+# Simplicity GitHub member via the Travis web interface [0].
+#
+# Travis CI uploads the cache after the script phase of the build [1].
+# However, the build is terminated without saving the cache if it takes over
+# 50 minutes [2]. Thus, if we spend too much time in early build stages, fail
+# with an error and save the cache.
+#
+# [0] https://travis-ci.org/simplicity-coin/simplicity/caches
+# [1] https://docs.travis-ci.com/user/caching/#build-phases
+# [2] https://docs.travis-ci.com/user/customizing-the-build#build-timeouts
+
+dist: xenial
+os: linux
+language: minimal
+cache:
+  ccache: true
+  directories:
+    - $TRAVIS_BUILD_DIR/depends/built
+    - $TRAVIS_BUILD_DIR/depends/sdk-sources
+    - $HOME/.ccache
+stages:
+  - lint
+  - test
+env:
+  global:
+    - MAKEJOBS=-j3
+    - RUN_UNIT_TESTS=false # todo - fix
+    - RUN_FUNCTIONAL_TESTS=false # Not Yet Implemented
+    - RUN_BENCH=false  # Set to true for any one job that has debug enabled, to quickly check bench is not crashing or hitting assertions
+    - DOCKER_NAME_TAG=ubuntu:18.04
+    - BOOST_TEST_RANDOM=1$TRAVIS_BUILD_ID
+    - CCACHE_SIZE=100M
+    - CCACHE_TEMPDIR=/tmp/.ccache-temp
+    - CCACHE_COMPRESS=1
+    - CCACHE_DIR=$HOME/.ccache
+    - BASE_OUTDIR=$TRAVIS_BUILD_DIR/out
+    - SDK_URL=https://bitcoincore.org/depends-sources/sdks
+    - WINEDEBUG=fixme-all
+    - DOCKER_PACKAGES="build-essential libtool autotools-dev automake pkg-config bsdmainutils curl git ca-certificates ccache"
+    - CACHE_ERR_MSG="Error! Initial build successful, but not enough time remains to run later build stages and tests. Please manually re-run this job by using the travis restart button or asking a bitcoin maintainer to restart. The next run should not time out because the build cache has been saved."
+before_install:
+  - set -o errexit; source .travis/test_03_before_install.sh
+install:
+  - set -o errexit; source .travis/test_04_install.sh
+before_script:
+  - set -o errexit; source .travis/test_05_before_script.sh
+script:
+  - export CONTINUE=1
+  - if [ $SECONDS -gt 1200 ]; then export CONTINUE=0; fi  # Likely the depends build took very long
+  - if [ $CONTINUE = "1" ]; then set -o errexit; source .travis/test_06_script_a.sh; else set +o errexit; echo "$CACHE_ERR_MSG"; false; fi
+  - if [ $SECONDS -gt 1500 ]; then export CONTINUE=0; fi  # Likely the build took very long; the tests take about 1000s, so abort once more than 1500s of the 50*60=3000s limit have elapsed
+  - if [ $CONTINUE = "1" ]; then set -o errexit; source .travis/test_06_script_b.sh; else set +o errexit; echo "$CACHE_ERR_MSG"; false; fi
+after_script:
+  - echo $TRAVIS_COMMIT_RANGE
+  - echo $TRAVIS_COMMIT_LOG
+jobs:
+  include:
+
+    - stage: lint
+      name: 'lint'
+      env:
+      cache: false
+      language: python
+      python: '3.5' # Oldest supported version according to doc/dependencies.md
+      install:
+        - set -o errexit; source .travis/lint_04_install.sh
+      before_script:
+        - set -o errexit; source .travis/lint_05_before_script.sh
+      script:
+        - set -o errexit; source .travis/lint_06_script.sh
+
+    - stage: test
+      name: 'ARM 32-bit  [GOAL: install]  [no unit or functional tests]'
+      env: >-
+        HOST=arm-linux-gnueabihf
+        PACKAGES="python3 g++-arm-linux-gnueabihf"
+        RUN_UNIT_TESTS=false
+        RUN_FUNCTIONAL_TESTS=false
+        GOAL="install"
+        # -Wno-psabi is to disable ABI warnings: "note: parameter passing for argument of type ... changed in GCC 7.1"
+        # This could be removed once the ABI change warning does not show up by default
+        BITCOIN_CONFIG="--enable-glibc-back-compat --enable-reduce-exports CXXFLAGS=-Wno-psabi"
+
+    - stage: test
+      name: 'ARM 64-bit  [GOAL:install] [no unit or functional tests]'
+      env: >-
+        HOST=aarch64-linux-gnu
+        PACKAGES="python3 g++-aarch64-linux-gnu"
+        RUN_UNIT_TESTS=false
+        RUN_FUNCTIONAL_TESTS=false
+        GOAL="install"
+        BITCOIN_CONFIG="--enable-glibc-back-compat --enable-reduce-exports"
+
+    - stage: test
+      name: 'Win32  [GOAL: deploy] [no functional tests]'
+      env: >-
+        HOST=i686-w64-mingw32
+        DPKG_ADD_ARCH="i386"
+        PACKAGES="python3 nsis g++-mingw-w64-i686 wine-binfmt wine32"
+        RUN_FUNCTIONAL_TESTS=false
+        GOAL="deploy"
+        BITCOIN_CONFIG="--enable-reduce-exports"
+
+    - stage: test
+      name: 'Win64  [GOAL: deploy] [no functional tests]'
+      env: >-
+        HOST=x86_64-w64-mingw32
+        PACKAGES="python3 nsis g++-mingw-w64-x86-64 wine-binfmt wine64"
+        RUN_FUNCTIONAL_TESTS=false
+        GOAL="deploy"
+        BITCOIN_CONFIG="--enable-reduce-exports"
+
+    - stage: test
+      name: '32-bit + dash  [GOAL: install] [no gui]'
+      env: >-
+        HOST=i686-pc-linux-gnu
+        PACKAGES="g++-multilib python3-zmq"
+        GOAL="install"
+        BITCOIN_CONFIG="--enable-zmq --with-gui=qt5 --enable-glibc-back-compat --enable-reduce-exports LDFLAGS=-static-libstdc++"
+        CONFIG_SHELL="/bin/dash"
+
+    #- stage: test # todo - fix
+      #name: 'x86_64 Linux  [GOAL: install]  [bionic]  [uses qt5 dev package instead of depends Qt to speed up build and avoid timeout]'
+      #env: >-
+        #HOST=x86_64-unknown-linux-gnu
+        #PACKAGES="python3-zmq qtbase5-dev qttools5-dev-tools protobuf-compiler libdbus-1-dev libharfbuzz-dev libprotobuf-dev"
+        #DEP_OPTS="NO_QT=1 NO_UPNP=1 DEBUG=1 ALLOW_HOST_PACKAGES=1"
+        #RUN_FUNCTIONAL_TESTS=true
+        ##TEST_RUNNER_EXTRA="--coverage --extended"  # Run extended tests so that coverage does not fail, but exclude the very slow dbcrash
+        #GOAL="install"
+        #BITCOIN_CONFIG="--enable-zmq --with-gui=qt5 --enable-glibc-back-compat --enable-reduce-exports"
+
+    - stage: test
+      name: 'x86_64 Linux  [GOAL: install]  [trusty]  [no functional tests, no depends, only system libs]'
+      env: >-
+        HOST=x86_64-unknown-linux-gnu
+        DOCKER_NAME_TAG=ubuntu:14.04
+        PACKAGES="python3-zmq qtbase5-dev qttools5-dev-tools libicu-dev libpng-dev libssl-dev libevent-dev bsdmainutils libboost-system-dev libboost-filesystem-dev libboost-chrono-dev libboost-program-options-dev libboost-test-dev libboost-thread-dev libdb5.1++-dev libminiupnpc-dev libzmq3-dev libprotobuf-dev protobuf-compiler libqrencode-dev libgmp-dev"
+        NO_DEPENDS=1
+        RUN_FUNCTIONAL_TESTS=false
+        GOAL="install"
+        BITCOIN_CONFIG="--enable-zmq --with-incompatible-bdb --with-gui=no"
+
+    - stage: test
+      name: 'x86_64 Linux  [GOAL: install]  [xenial]  [no depends, only system libs]'
+      env: >-
+        HOST=x86_64-unknown-linux-gnu
+        DOCKER_NAME_TAG=ubuntu:16.04
+        PACKAGES="python3-zmq qtbase5-dev qttools5-dev-tools libssl-dev libevent-dev bsdmainutils libboost-system-dev libboost-filesystem-dev libboost-chrono-dev libboost-program-options-dev libboost-test-dev libboost-thread-dev libdb5.3++-dev libminiupnpc-dev libzmq3-dev libprotobuf-dev protobuf-compiler libqrencode-dev libgmp-dev"
+        NO_DEPENDS=1
+        GOAL="install"
+        BITCOIN_CONFIG="--enable-zmq --with-incompatible-bdb --with-gui=qt5 CPPFLAGS=-DDEBUG_LOCKORDER --disable-hardening --disable-asm"
+
+    - stage: test
+      name: 'x86_64 Linux  [GOAL: install]  [bionic]  [no depends, only system libs]'
+      env: >-
+        HOST=x86_64-unknown-linux-gnu
+        PACKAGES="python3-zmq qtbase5-dev qttools5-dev-tools libssl1.0-dev libevent-dev bsdmainutils libboost-system-dev libboost-filesystem-dev libboost-chrono-dev libboost-program-options-dev libboost-test-dev libboost-thread-dev libdb5.3++-dev libminiupnpc-dev libzmq3-dev libprotobuf-dev protobuf-compiler libqrencode-dev libgmp-dev"
+        NO_DEPENDS=1
+        GOAL="install"
+        BITCOIN_CONFIG="--enable-zmq --with-incompatible-bdb --with-gui=qt5 CPPFLAGS=-DDEBUG_LOCKORDER"
+
+#    - stage: test
+#      name: 'x86_64 Linux  [GOAL: install]  [bionic]  [no depends, only system libs, sanitizers: fuzzer,address]'
+#      env: >-
+#        HOST=x86_64-unknown-linux-gnu
+#        PACKAGES="clang python3-zmq qtbase5-dev qttools5-dev-tools libssl1.0-dev libevent-dev bsdmainutils libboost-system-dev libboost-filesystem-dev libboost-chrono-dev libboost-program-options-dev libboost-test-dev libboost-thread-dev libdb5.3++-dev libminiupnpc-dev libzmq3-dev libprotobuf-dev protobuf-compiler libqrencode-dev libgmp-dev"
+#        NO_DEPENDS=1
+#        RUN_UNIT_TESTS=false
+#        RUN_FUNCTIONAL_TESTS=false
+#        RUN_BENCH=true
+#        GOAL="install"
+#        BITCOIN_CONFIG="--enable-zmq --with-incompatible-bdb --enable-glibc-back-compat --enable-reduce-exports --with-gui=qt5 CPPFLAGS=-DDEBUG_LOCKORDER --with-sanitizers=undefined CC=clang CXX=clang++"
+
+#    - stage: test
+#      name: 'x86_64 Linux  [GOAL: install]  [bionic]  [no wallet]'
+#      env: >-
+#        HOST=x86_64-unknown-linux-gnu
+#        PACKAGES="python3-zmq"
+#        DEP_OPTS="NO_WALLET=1"
+#        GOAL="install"
+#        BITCOIN_CONFIG="--enable-glibc-back-compat --enable-reduce-exports"
+
+    - stage: test
+      name: 'macOS 10.10  [GOAL: deploy] [no functional tests]'
+      env: >-
+        HOST=x86_64-apple-darwin14
+        PACKAGES="cmake imagemagick libcap-dev librsvg2-bin libz-dev libbz2-dev libtiff-tools python3-dev python3-setuptools"
+        OSX_SDK=10.11
+        RUN_UNIT_TESTS=false
+        RUN_FUNCTIONAL_TESTS=false
+        GOAL="deploy"
+        BITCOIN_CONFIG="--enable-gui --enable-reduce-exports --enable-werror"
diff --git a/.travis/README.md b/.travis/README.md
new file mode 100644
index 0000000..c837e19
--- /dev/null
+++ b/.travis/README.md
@@ -0,0 +1,7 @@
+## travis build scripts
+
+The `.travis` directory contains scripts for each build step in each build stage.
+Currently the travis build defines two stages: `lint` and `test`. Each stage has
+its own [lifecycle](https://docs.travis-ci.com/user/customizing-the-build/#the-build-lifecycle).
+Every script in here is named and numbered according to which stage and lifecycle
+step it belongs to.
diff --git a/.travis/lint_04_install.sh b/.travis/lint_04_install.sh
new file mode 100755
index 0000000..9a22773
--- /dev/null
+++ b/.travis/lint_04_install.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C
+
+travis_retry pip install codespell==1.13.0  # Each linter is installed at a pinned version
+travis_retry pip install flake8==3.5.0
+travis_retry pip install vulture==0.29
+
+SHELLCHECK_VERSION=v0.6.0
+curl -s "https://storage.googleapis.com/shellcheck/shellcheck-${SHELLCHECK_VERSION}.linux.x86_64.tar.xz" | tar --xz -xf - --directory /tmp/  # Stream the tarball straight into tar, extracting under /tmp/
+export PATH="/tmp/shellcheck-${SHELLCHECK_VERSION}:${PATH}"  # Assumes the tarball unpacks to /tmp/shellcheck-${SHELLCHECK_VERSION}; prepended so it is found first
diff --git a/.travis/lint_05_before_script.sh b/.travis/lint_05_before_script.sh
new file mode 100755
index 0000000..5a4aab1
--- /dev/null
+++ b/.travis/lint_05_before_script.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C
+
+#git fetch --unshallow
diff --git a/.travis/lint_06_script.sh b/.travis/lint_06_script.sh
new file mode 100755
index 0000000..deabc13
--- /dev/null
+++ b/.travis/lint_06_script.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C
+
+
+#contrib/devtools/git-subtree-check.sh src/secp256k1
+#contrib/devtools/git-subtree-check.sh src/univalue
+#contrib/devtools/git-subtree-check.sh src/leveldb
+contrib/devtools/check-doc.py  # These two checks run on every lint build
+contrib/devtools/logprint-scanner.py
+
+if [ "$TRAVIS_EVENT_TYPE" = "pull_request" ]; then  # The whitespace lint runs for pull-request builds only
+  contrib/devtools/lint-whitespace.sh
+fi
diff --git a/.travis/test_03_before_install.sh b/.travis/test_03_before_install.sh
new file mode 100755
index 0000000..16d3154
--- /dev/null
+++ b/.travis/test_03_before_install.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C.UTF-8
+
+PATH=$(echo "$PATH" | tr ':' "\n" | sed '/\/opt\/python/d' | tr "\n" ":" | sed "s|::|:|g")  # Drop every PATH entry containing /opt/python; the result ends with a ':' (from echo's final newline)
+# Add llvm-symbolizer directory to PATH. Needed to get symbolized stack traces from the sanitizers.
+PATH=${PATH%:}:/usr/lib/llvm-6.0/bin/  # Strip that trailing ':' first, otherwise '::' adds an empty entry, i.e. the current directory, to PATH
+export PATH
+
+BEGIN_FOLD () {  # Open a Travis log fold; $1 is the fold name
+  echo ""
+  CURRENT_FOLD_NAME=$1  # Not declared local: END_FOLD reads this variable to close the same fold
+  echo "travis_fold:start:${CURRENT_FOLD_NAME}"
+}
+
+END_FOLD () {  # Close the fold named by CURRENT_FOLD_NAME and report a failing status
+  RET=$?  # Must stay the first statement: captures the exit status of the command that ran just before END_FOLD
+  echo "travis_fold:end:${CURRENT_FOLD_NAME}"
+  if [ $RET != 0 ]; then  # Only print a message when that command failed
+    echo "${CURRENT_FOLD_NAME} failed with status code ${RET}"
+  fi
+}
diff --git a/.travis/test_04_install.sh b/.travis/test_04_install.sh
new file mode 100755
index 0000000..451e921
--- /dev/null
+++ b/.travis/test_04_install.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C.UTF-8
+
+travis_retry docker pull "$DOCKER_NAME_TAG"
+env | grep -E '^(BITCOIN_CONFIG|CCACHE_|WINEDEBUG|LC_ALL|BOOST_TEST_RANDOM|CONFIG_SHELL)' | tee /tmp/env  # Save the matching variables to /tmp/env, which is passed to the container via --env-file below
+if [[ $HOST = *-mingw32 ]]; then
+  DOCKER_ADMIN="--cap-add SYS_ADMIN"
+elif [[ $BITCOIN_CONFIG = *--with-sanitizers=*address* ]]; then # If run with (ASan + LSan), Docker needs access to ptrace (https://github.com/google/sanitizers/issues/764)
+  DOCKER_ADMIN="--cap-add SYS_PTRACE"
+fi
+DOCKER_ID=$(docker run $DOCKER_ADMIN -idt --mount type=bind,src=$TRAVIS_BUILD_DIR,dst=$TRAVIS_BUILD_DIR --mount type=bind,src=$CCACHE_DIR,dst=$CCACHE_DIR -w $TRAVIS_BUILD_DIR --env-file /tmp/env $DOCKER_NAME_TAG)  # Start a detached container with the build and ccache directories bind-mounted at the same paths as on the host; its id is used by DOCKER_EXEC
+
+DOCKER_EXEC () {  # Run the given command line inside the container $DOCKER_ID
+  docker exec $DOCKER_ID bash -c "cd $PWD && $*"  # $PWD and $* are expanded on the host at call time: the command runs in the caller's current directory, and all arguments are flattened into one string that bash re-parses inside the container
+}
+
+if [ -n "$DPKG_ADD_ARCH" ]; then  # Only jobs that set DPKG_ADD_ARCH register an extra dpkg architecture
+  DOCKER_EXEC dpkg --add-architecture "$DPKG_ADD_ARCH"
+fi
+
+travis_retry DOCKER_EXEC apt-get update
+travis_retry DOCKER_EXEC apt-get install --no-install-recommends --no-upgrade -qq $PACKAGES $DOCKER_PACKAGES  # PACKAGES (per job) and DOCKER_PACKAGES (global) are space-separated package lists from .travis.yml
diff --git a/.travis/test_05_before_script.sh b/.travis/test_05_before_script.sh
new file mode 100755
index 0000000..64a3223
--- /dev/null
+++ b/.travis/test_05_before_script.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C.UTF-8
+
+#DOCKER_EXEC echo \> \$HOME/.simplicity  # Make sure default datadir does not exist and is never read by creating a dummy file
+
+mkdir -p depends/SDKs depends/sdk-sources
+
+if [ -n "$OSX_SDK" -a ! -f depends/sdk-sources/MacOSX${OSX_SDK}.sdk.tar.gz ]; then  # Download the SDK tarball only when it is missing (depends/sdk-sources is a cached directory in .travis.yml)
+  curl --location --fail $SDK_URL/MacOSX${OSX_SDK}.sdk.tar.gz -o depends/sdk-sources/MacOSX${OSX_SDK}.sdk.tar.gz
+fi
+if [ -n "$OSX_SDK" -a -f depends/sdk-sources/MacOSX${OSX_SDK}.sdk.tar.gz ]; then  # Extract the tarball into depends/SDKs whenever it is present
+  tar -C depends/SDKs -xf depends/sdk-sources/MacOSX${OSX_SDK}.sdk.tar.gz
+fi
+if [[ $HOST = *-mingw32 ]]; then
+  DOCKER_EXEC update-alternatives --set $HOST-g++ \$\(which $HOST-g++-posix\)  # The escaped \$\(...\) is evaluated inside the container, not on the host
+fi
+if [ -z "$NO_DEPENDS" ]; then  # Build ./depends unless the job sets NO_DEPENDS
+  DOCKER_EXEC CONFIG_SHELL= make $MAKEJOBS -C depends HOST=$HOST $DEP_OPTS  # CONFIG_SHELL is set to empty for this make invocation
+fi
diff --git a/.travis/test_06_script_a.sh b/.travis/test_06_script_a.sh
new file mode 100755
index 0000000..89eabf1
--- /dev/null
+++ b/.travis/test_06_script_a.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C.UTF-8
+
+TRAVIS_COMMIT_LOG=$(git log --format=fuller -1)  # This script is sourced, so the exported value is still set when after_script in .travis.yml echoes it
+export TRAVIS_COMMIT_LOG
+
+OUTDIR=$BASE_OUTDIR/$TRAVIS_PULL_REQUEST/$TRAVIS_JOB_NUMBER-$HOST
+BITCOIN_CONFIG_ALL="--disable-dependency-tracking --prefix=$TRAVIS_BUILD_DIR/depends/$HOST --bindir=$OUTDIR/bin --libdir=$OUTDIR/lib"
+if [ -z "$NO_DEPENDS" ]; then  # The ccache size limit is only applied for jobs that build ./depends
+  DOCKER_EXEC ccache --max-size=$CCACHE_SIZE
+fi
+
+BEGIN_FOLD autogen
+if [ -n "$CONFIG_SHELL" ]; then  # Run autogen.sh through the job's CONFIG_SHELL when one is set
+  DOCKER_EXEC "$CONFIG_SHELL" -c "./autogen.sh"
+else
+  DOCKER_EXEC ./autogen.sh
+fi
+END_FOLD
+
+mkdir build
+cd build || (echo "could not enter build directory"; exit 1)
+
+BEGIN_FOLD configure
+DOCKER_EXEC ../configure --cache-file=config.cache $BITCOIN_CONFIG_ALL $BITCOIN_CONFIG || ( cat config.log && false)  # On failure print config.log, then keep the failing status
+END_FOLD
+
+BEGIN_FOLD distdir
+DOCKER_EXEC make distdir VERSION=$HOST  # Expected to produce the simplicity-$HOST directory entered below
+END_FOLD
+
+cd "simplicity-$HOST" || (echo "could not enter distdir simplicity-$HOST"; exit 1)
+
+BEGIN_FOLD configure
+DOCKER_EXEC ./configure --cache-file=../config.cache $BITCOIN_CONFIG_ALL $BITCOIN_CONFIG || ( cat config.log && false)  # Second configure runs inside the distdir and reuses the cache file written by the first
+END_FOLD
+
+set -o errtrace  # Let shell functions and subshells inherit the ERR trap set on the next line
+trap 'DOCKER_EXEC "cat ${TRAVIS_BUILD_DIR}/sanitizer-output/* 2> /dev/null"' ERR  # On any error, print whatever is under sanitizer-output; a missing directory is silenced
+
+BEGIN_FOLD build
+DOCKER_EXEC make $MAKEJOBS $GOAL || ( echo "Build failure. Verbose build follows." && DOCKER_EXEC make $GOAL V=1 ; false )  # On failure rebuild without $MAKEJOBS and with V=1, then keep the failing status
+END_FOLD
+
+cd ${TRAVIS_BUILD_DIR} || (echo "could not enter travis build dir $TRAVIS_BUILD_DIR"; exit 1)
diff --git a/.travis/test_06_script_b.sh b/.travis/test_06_script_b.sh
new file mode 100755
index 0000000..6bacf99
--- /dev/null
+++ b/.travis/test_06_script_b.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2018 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+export LC_ALL=C.UTF-8
+
+cd "build/simplicity-$HOST" || (echo "could not enter distdir build/simplicity-$HOST"; exit 1)  # Same distdir that test_06_script_a.sh configured and built
+
+if [ "$RUN_UNIT_TESTS" = "true" ]; then  # Defaults to false in the global env of .travis.yml
+  BEGIN_FOLD unit-tests
+  DOCKER_EXEC LD_LIBRARY_PATH=$TRAVIS_BUILD_DIR/depends/$HOST/lib make $MAKEJOBS check VERBOSE=1
+  END_FOLD
+fi
+
+if [ "$RUN_FUNCTIONAL_TESTS" = "true" ]; then  # Defaults to false in the global env of .travis.yml
+  BEGIN_FOLD functional-tests
+  DOCKER_EXEC test/functional/test_runner.py --combinedlogslen=4000 ${TEST_RUNNER_EXTRA}
+  END_FOLD
+fi
+
+cd ${TRAVIS_BUILD_DIR} || (echo "could not enter travis build dir $TRAVIS_BUILD_DIR"; exit 1)
diff --git a/README.md b/README.md
index f16e1a0..8ef614b 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 Simplicity integration/staging repository
 =====================================
 
-[![Build Status](https://travis-ci.org/Simplicity-Project/Simplicity.svg?branch=master)](https://travis-ci.org/Simplicity-Project/Simplicity) [![GitHub version](https://badge.fury.io/gh/Simplicity-Project%2FSimplicity.svg)](https://badge.fury.io/gh/Simplicity-Project%2FSimplicity)
+[![Build Status](https://travis-ci.com/simplicity-coin/simplicity.svg?branch=master)](https://travis-ci.com/simplicity-coin/simplicity) [![GitHub version](https://badge.fury.io/gh/simplicity-coin%2Fsimplicity.svg)](https://badge.fury.io/gh/simplicity-coin%2Fsimplicity)
 
 Simplicity is an open source crypto-currency focused on fast transactions, with low transaction fees & environmental footprint. It utilizes multi algo PoW, PoS, and several masternode tiers for securing its network. The goal of Simplicity is to achieve a decentralized sustainable crypto currency with near instant transactions, fair governance and community intelligence.
 - Fast transactions featuring guaranteed zero confirmation transactions, we call it _SwiftX_.
diff --git a/configure.ac b/configure.ac
index 86938f3..d8567bd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@ AC_PREREQ([2.60])
 define(_CLIENT_VERSION_MAJOR, 2)
 define(_CLIENT_VERSION_MINOR, 0)
 define(_CLIENT_VERSION_REVISION, 1)
-define(_CLIENT_VERSION_BUILD, 1)
+define(_CLIENT_VERSION_BUILD, 2)
 define(_CLIENT_VERSION_IS_RELEASE, true)
 define(_COPYRIGHT_YEAR, 2019)
 AC_INIT([Simplicity],[_CLIENT_VERSION_MAJOR._CLIENT_VERSION_MINOR._CLIENT_VERSION_REVISION],[www.simplicity.org],[simplicity])
@@ -58,6 +58,9 @@ case $host in
   *mingw*)
      lt_cv_deplibs_check_method="pass_all"
   ;;
+  *aarch64*)
+     CFLAGS="$CFLAGS -march=armv8.1-a+crypto"
+  ;;
 esac
 dnl Require C++11 compiler (no GNU extensions)
 AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory], [nodefault])
@@ -189,7 +192,6 @@ AC_ARG_ENABLE([asm],
 
 if test "x$use_asm" = xyes; then
   AC_DEFINE(USE_ASM, 1, [Define this symbol to build in assembly routines])
-  AC_DEFINE(USE_XOP, 1, [Define this symbol to enable optimization])
 fi
 
 AC_ARG_WITH([system-univalue],
@@ -326,7 +328,9 @@ fi
 # compatibility.
 AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
 AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
-AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
+AX_CHECK_COMPILE_FLAG([-mavx],[[AVX_CXXFLAGS="-mavx"]],,[[$CXXFLAG_WERROR]])
+AX_CHECK_COMPILE_FLAG([-mxop],[[XOP_CXXFLAGS="-mxop"]],,[[$CXXFLAG_WERROR]])
+AX_CHECK_COMPILE_FLAG([-mavx2],[[AVX2_CXXFLAGS="-mavx2"]],,[[$CXXFLAG_WERROR]])
 AX_CHECK_COMPILE_FLAG([-msse4 -msha],[[SHANI_CXXFLAGS="-msse4 -msha"]],,[[$CXXFLAG_WERROR]])
 
 TEMP_CXXFLAGS="$CXXFLAGS"
@@ -366,6 +370,30 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 )
 CXXFLAGS="$TEMP_CXXFLAGS"
 
+TEMP_CXXFLAGS="$CXXFLAGS"
+CXXFLAGS="$CXXFLAGS $AVX_CXXFLAGS"
+AC_MSG_CHECKING(for AVX intrinsics)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+    #include <stdint.h>
+    #include <immintrin.h>
+  ]],[[
+    __m256i l = _mm256_set1_epi32(0);
+    return _mm256_extract_epi32(l, 7);
+  ]])],
+ [ AC_MSG_RESULT(yes); enable_avx=yes; AC_DEFINE(ENABLE_AVX, 1, [Define this symbol to build code that uses AVX intrinsics]) ],
+ [ AC_MSG_RESULT(no)]
+)
+CXXFLAGS="$TEMP_CXXFLAGS"
+
+TEMP_CXXFLAGS="$CXXFLAGS"
+CXXFLAGS="$CXXFLAGS $XOP_CXXFLAGS"
+AC_MSG_CHECKING(for XOP intrinsics)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([asm ("vprotd \$7, %xmm0, %xmm1");])],
+ [ AC_MSG_RESULT(yes); enable_xop=yes; AC_DEFINE(ENABLE_XOP, 1, [Define this symbol to build code that uses XOP intrinsics]) ],
+ [ AC_MSG_RESULT(no)]
+)
+CXXFLAGS="$TEMP_CXXFLAGS"
+
 TEMP_CXXFLAGS="$CXXFLAGS"
 CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
 AC_MSG_CHECKING(for AVX2 intrinsics)
@@ -374,7 +402,8 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
     #include <immintrin.h>
   ]],[[
     __m256i l = _mm256_set1_epi32(0);
-    return _mm256_extract_epi32(l, 7);
+    __m256i m = _mm256_add_epi32(l, l);
+    return _mm256_extract_epi32(m, 7);
   ]])],
  [ AC_MSG_RESULT(yes); enable_avx2=yes; AC_DEFINE(ENABLE_AVX2, 1, [Define this symbol to build code that uses AVX2 intrinsics]) ],
  [ AC_MSG_RESULT(no)]
@@ -1334,6 +1363,8 @@ AM_CONDITIONAL([GLIBC_BACK_COMPAT],[test x$use_glibc_compat = xyes])
 AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
 AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
 AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
+AM_CONDITIONAL([ENABLE_AVX],[test x$enable_avx = xyes])
+AM_CONDITIONAL([ENABLE_XOP],[test x$enable_xop = xyes])
 AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
 AM_CONDITIONAL([ENABLE_SHANI],[test x$enable_shani = xyes])
 AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])
@@ -1373,6 +1404,8 @@ AC_SUBST(SANITIZER_CXXFLAGS)
 AC_SUBST(SANITIZER_LDFLAGS)
 AC_SUBST(SSE42_CXXFLAGS)
 AC_SUBST(SSE41_CXXFLAGS)
+AC_SUBST(AVX_CXXFLAGS)
+AC_SUBST(XOP_CXXFLAGS)
 AC_SUBST(AVX2_CXXFLAGS)
 AC_SUBST(SHANI_CXXFLAGS)
 AC_SUBST(LIBTOOL_APP_LDFLAGS)
diff --git a/src/Makefile.am b/src/Makefile.am
index 17e0101..21010c7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -297,13 +297,15 @@ crypto_libbitcoin_crypto_a_SOURCES = \
   crypto/rfc6979_hmac_sha256.cpp \
   crypto/hmac_sha512.cpp \
   crypto/scrypt.cpp \
-  crypto/scrypt2.cpp \
-  crypto/scrypt-arm.S \
-  crypto/scrypt-x64.S \
-  crypto/scrypt-x86.S \
   crypto/sha2-arm.S \
+  crypto/sha2-armv8.c \
   crypto/sha2-x64.S \
   crypto/sha2-x86.S \
+  crypto/scrypt-arm.S \
+  crypto/scrypt-armv8.c \
+  crypto/scrypt-x64.S \
+  crypto/scrypt-x86.S \
+  crypto/scrypt_opt.cpp \
   crypto/ripemd160.cpp \
   crypto/aes_helper.c \
   crypto/blake.c \
@@ -319,7 +321,7 @@ crypto_libbitcoin_crypto_a_SOURCES = \
   crypto/rfc6979_hmac_sha256.h \
   crypto/hmac_sha512.h \
   crypto/scrypt.h \
-  crypto/scrypt2.h \
+  crypto/scrypt_opt.h \
   crypto/sha1.h \
   crypto/ripemd160.h \
   crypto/sph_blake.h \
diff --git a/src/activemasternode.cpp b/src/activemasternode.cpp
index fd17141..2f6dc3b 100644
--- a/src/activemasternode.cpp
+++ b/src/activemasternode.cpp
@@ -473,9 +473,17 @@ std::vector<COutput> CActiveMasternode::SelectCoinsMasternode()
     }
 
     // Filter
-    for (const COutput& out : vCoins) {
-        if (CMasternode::IsDepositCoins(out.tx->vout[out.i].nValue)) {
-            filteredCoins.push_back(out);
+    if (IsSporkActive(SPORK_18_NEW_MASTERNODE_TIERS)) {
+        for (const COutput& out : vCoins) {
+            if (CMasternode::IsDepositCoins(out.tx->vout[out.i].nValue)) {
+                filteredCoins.push_back(out);
+            }
+        }
+    } else {
+        for (const COutput& out : vCoins) {
+            if (CMasternode::Level(out.tx->vout[out.i].nValue, chainActive.Height()) == 3u) {
+                filteredCoins.push_back(out);
+            }
         }
     }
     return filteredCoins;
diff --git a/src/chain.h b/src/chain.h
index 48c2870..4889985 100644
--- a/src/chain.h
+++ b/src/chain.h
@@ -171,7 +171,7 @@ class CBlockIndex
     uint256 nStakeModifierV2;
 
     //! block header
-    int nVersion;
+    unsigned int nVersion;
     uint256 hashMerkleRoot;
     unsigned int nTime;
     unsigned int nBits;
@@ -395,7 +395,7 @@ class CBlockIndex
      * in the last Params().ToCheckBlockUpgradeMajority() blocks, starting at pstart
      * and going backwards.
      */
-    static bool IsSuperMajority(int minVersion, const CBlockIndex* pstart, unsigned int nRequired);
+    static bool IsSuperMajority(unsigned int minVersion, const CBlockIndex* pstart, unsigned int nRequired);
 
     std::string ToString() const
     {
diff --git a/src/chainparams.cpp b/src/chainparams.cpp
index 2a2c001..6744eb9 100644
--- a/src/chainparams.cpp
+++ b/src/chainparams.cpp
@@ -70,11 +70,13 @@ static Checkpoints::MapCheckpoints mapCheckpoints =
     (650000, uint256("cf6076eda981af1097e52f505c0c3dfefc60af9b93ba674fda8e22235ff50df3"))
     (700000, uint256("2f852dfbc9b767905400c0e706ff63eca8d5e4090d4d49f873f9be1a754cc243"))
     (950000, uint256("0cbb4dfac570e6cbf7aa10b3f8a138b3dda3e908ab78e301f12354731bbce560"))
-    (1030000, uint256("6435fc65c4b7dc50bf254124884d7787c99451b2fe8b604c5a8435849beba1f5"));
+    (1030000, uint256("6435fc65c4b7dc50bf254124884d7787c99451b2fe8b604c5a8435849beba1f5"))
+    (1040000, uint256("a8212fbda825a42ecd3a0d1251437626fbde53afc1ea4eea76d05b4898718a0f"))
+    (1050000, uint256("9ab97fa25881e95b4c22fb7515d90738054a43231231acf8d5fc3be581591192"));
 static const Checkpoints::CCheckpointData data = {
     &mapCheckpoints,
-    1573624156, // * UNIX timestamp of last checkpoint block
-    1710219,    // * total number of transactions between genesis and last checkpoint
+    1575775743, // * UNIX timestamp of last checkpoint block
+    1743384,    // * total number of transactions between genesis and last checkpoint
                 //   (the tx=... number in the SetBestChain debug.log lines)
     2000        // * estimated number of transactions per day after checkpoint
 };
@@ -231,6 +233,9 @@ class CMainParams : public CChainParams
         genesis.nBits = 0x1f00ffff;
         genesis.nNonce = 561379;
 
+        //uint256 hashTarget = uint256().SetCompact(genesis.nBits);
+        //assert(genesis.GetPoWHash() <= hashTarget);
+
         hashGenesisBlock = genesis.GetHash();
         assert(genesis.hashMerkleRoot == uint256("0x40bdd3d5ae84b91a71190094a82948400eb3356e87c5376b64d79509cf552d84"));
         assert(hashGenesisBlock == uint256("0xf4bbfc518aa3622dbeb8d2818a606b82c2b8b1ac2f28553ebdb6fc04d7abaccf"));
@@ -246,7 +251,7 @@ class CMainParams : public CChainParams
         base58Prefixes[EXT_PUBLIC_KEY] = boost::assign::list_of(0x04)(0x44)(0xD5)(0xBC).convert_to_container<std::vector<unsigned char> >();
         base58Prefixes[EXT_SECRET_KEY] = boost::assign::list_of(0x04)(0x44)(0xF0)(0xA3).convert_to_container<std::vector<unsigned char> >();
         // BIP44 coin type is from https://github.com/satoshilabs/slips/blob/master/slip-0044.md
-        base58Prefixes[EXT_COIN_TYPE] = boost::assign::list_of(0x80)(0x00)(0x00)(0x77).convert_to_container<std::vector<unsigned char> >();
+        base58Prefixes[EXT_COIN_TYPE] = boost::assign::list_of(0x80)(0x00)(0x01)(0xc0).convert_to_container<std::vector<unsigned char> >();
 
         convertSeed6(vFixedSeeds, pnSeed6_main, ARRAYLEN(pnSeed6_main));
 
@@ -263,7 +268,7 @@ class CMainParams : public CChainParams
         nBudgetCycleBlocks = 30 * 24 * 60 * 60 / nTargetSpacing; //!< Amount of blocks in a months period of time (using 1 minutes per) = (60*24*30)
         strSporkKey = "03fdfa718ec40be6ce1b5fadf36022a4f0ff2f1efc872291ffbe42af127bdd2859";
         strSporkKeyOld = "03fdfa718ec40be6ce1b5fadf36022a4f0ff2f1efc872291ffbe42af127bdd2859";
-        strObfuscationPoolDummyAddress = "D87q2gC9j6nNrnzCsg4aY6bHMLsT9nUhEw";
+        strObfuscationPoolDummyAddress = "8JLdPguDU5HJkBvjfca7vD79wXvksPvJMz";
         nStartMasternodePayments = 1403728576; //Wed, 25 Jun 2014 20:36:16 GMT
 
         /** Zerocoin */
@@ -311,8 +316,8 @@ class CTestNetParams : public CMainParams
         pchMessageStart[3] = 0xc6;
         vAlertPubKey = ParseHex("03b95000b2b06e391c058ea14d47ac3c525753c68460864f254ada5a63e27a8134");
         nDefaultPort = 21957;
-        bnProofOfWorkLimit[POW_QUARK] = ~uint256(0) >> 12;
-        bnProofOfWorkLimit[POW_SCRYPT_SQUARED] = ~uint256(0) >> 8;
+        bnProofOfWorkLimit[POW_QUARK] = ~uint256(0) >> 16;
+        bnProofOfWorkLimit[POW_SCRYPT_SQUARED] = ~uint256(0) >> 9;
         nEnforceBlockUpgradeMajority = 3780; // 70%
         nRejectBlockOutdatedMajority = 4050; // 75%
         nToCheckBlockUpgradeMajority = 5400; // 4 days (1350*4)
@@ -356,15 +361,18 @@ class CTestNetParams : public CMainParams
         //vBurnAddresses.emplace_back("xzd3LKsihYn1CKBESTQP7EresFECXEMivk");
 
         //! Modify the testnet genesis block so the timestamp is valid for a later start.
-        //genesis.nTime = 1454124731;
+        genesis.nTime = 1574924400;
         genesis.nBits = 0x1f00ffff;
-        genesis.nNonce = 93481;
+        genesis.nNonce = 164084;
+
+        uint256 hashTarget = uint256().SetCompact(genesis.nBits);
+        assert(genesis.GetPoWHash() <= hashTarget);
 
         hashGenesisBlock = genesis.GetHash();
         //printf("Merkle hash test: %s\n", genesis.hashMerkleRoot.ToString().c_str());
         //printf("Block hash test: %s\n", hashGenesisBlock.ToString().c_str());
         assert(genesis.hashMerkleRoot == uint256("0x40bdd3d5ae84b91a71190094a82948400eb3356e87c5376b64d79509cf552d84"));
-        assert(hashGenesisBlock == uint256("0xfcfc1b5bc930bc0a74643462617264e4f7aa39e276c637353bda6960b5726fb8"));
+        assert(hashGenesisBlock == uint256("0x000037a145d6812571b0c413d868a43146d7159056afe7a06b344e9ee0de39fc"));
 
         vFixedSeeds.clear();
         vSeeds.clear();
@@ -373,10 +381,8 @@ class CTestNetParams : public CMainParams
         base58Prefixes[PUBKEY_ADDRESS] = std::vector<unsigned char>(1, 139); // Testnet simplicity addresses start with 'x' or 'y'
         base58Prefixes[SCRIPT_ADDRESS] = std::vector<unsigned char>(1, 19);  // Testnet simplicity script addresses start with '8' or '9'
         base58Prefixes[SECRET_KEY] = std::vector<unsigned char>(1, 239);     // Testnet private keys start with '9' or 'c' (Bitcoin defaults)
-        // Testnet simplicity BIP32 pubkeys start with 'DRKV'
-        base58Prefixes[EXT_PUBLIC_KEY] = boost::assign::list_of(0x3a)(0x80)(0x61)(0xa0).convert_to_container<std::vector<unsigned char> >();
-        // Testnet simplicity BIP32 prvkeys start with 'DRKP'
-        base58Prefixes[EXT_SECRET_KEY] = boost::assign::list_of(0x3a)(0x80)(0x58)(0x37).convert_to_container<std::vector<unsigned char> >();
+        base58Prefixes[EXT_PUBLIC_KEY] = boost::assign::list_of(0x05)(0x55)(0xCF)(0xB1).convert_to_container<std::vector<unsigned char> >();
+        base58Prefixes[EXT_SECRET_KEY] = boost::assign::list_of(0x05)(0x55)(0xD4)(0x7A).convert_to_container<std::vector<unsigned char> >();
         // Testnet simplicity BIP44 coin type is '1' (All coin's testnet default)
         base58Prefixes[EXT_COIN_TYPE] = boost::assign::list_of(0x80)(0x00)(0x00)(0x01).convert_to_container<std::vector<unsigned char> >();
 
@@ -393,7 +399,7 @@ class CTestNetParams : public CMainParams
         nBudgetCycleBlocks = 24 * 6 * 60 / nTargetSpacing; //!< Ten cycles per day on testnet
         strSporkKey = "03b95000b2b06e391c058ea14d47ac3c525753c68460864f254ada5a63e27a8134";
         strSporkKeyOld = "03b95000b2b06e391c058ea14d47ac3c525753c68460864f254ada5a63e27a8134";
-        strObfuscationPoolDummyAddress = "y6S5YPwPCXi2oemSRJGitNPwPjcFJfwbED";
+        strObfuscationPoolDummyAddress = "yCQuB8kvJYRJyRFDJXrzVgVyfe2E68S8jb";
         nStartMasternodePayments = 1420837558; //Fri, 09 Jan 2015 21:05:58 GMT
         nBudget_Fee_Confirmations = 3; // Number of confirmations for the finalization fee. We have to make this very short
                                        // here because we only have a 8 block finalization window on testnet
@@ -448,15 +454,27 @@ class CRegTestParams : public CTestNetParams
         nPublicZCSpends = 350;
 
         //! Modify the regtest genesis block so the timestamp is valid for a later start.
-        genesis.nTime = 1454124731;
-        genesis.nBits = 0x207fffff;
-        genesis.nNonce = 12345;
+        genesis.nTime = 1574924400;
+        genesis.nBits = 0x1f00ffff;
+        genesis.nNonce = 164084;
+
+        uint256 hashTarget = uint256().SetCompact(genesis.nBits);
+        /*while (true) {
+            uint256 hash = genesis.GetPoWHash();
+            if (hash <= hashTarget) {
+                // Found a solution
+                printf("genesis block found\n   hash: %s\n target: %s\n  nonce: %i\n", hash.ToString().c_str(), hashTarget.ToString().c_str(), genesis.nNonce);
+                break;
+            }
+            genesis.nNonce += 1;
+        }*/
+        assert(genesis.GetPoWHash() <= hashTarget);
 
         hashGenesisBlock = genesis.GetHash();
         //printf("Merkle hash reg: %s\n", genesis.hashMerkleRoot.ToString().c_str());
         //printf("Block hash reg: %s\n", hashGenesisBlock.ToString().c_str());
         assert(genesis.hashMerkleRoot == uint256("0x40bdd3d5ae84b91a71190094a82948400eb3356e87c5376b64d79509cf552d84"));
-        assert(hashGenesisBlock == uint256("0xf38094d1d2ccb97e06248813ee887b48ee7326d68cb663ebf59eeca7a0bde2c4"));
+        assert(hashGenesisBlock == uint256("0x000037a145d6812571b0c413d868a43146d7159056afe7a06b344e9ee0de39fc"));
 
         vFixedSeeds.clear(); //! Testnet mode doesn't have any fixed seeds.
         vSeeds.clear();      //! Testnet mode doesn't have any DNS seeds.
diff --git a/src/chainparams.h b/src/chainparams.h
index 3d0c640..bf43bd5 100644
--- a/src/chainparams.h
+++ b/src/chainparams.h
@@ -63,7 +63,7 @@ class CChainParams
     const CBlock& GenesisBlock() const { return genesis; }
     /** Make miner wait to have peers to avoid wasting work */
     bool MiningRequiresPeers() const { return fMiningRequiresPeers; }
-    /** Headers first syncing is disabled */
+    /** Headers first syncing is enabled */
     bool HeadersFirstSyncingActive() const { return fHeadersFirstSyncingActive; };
     /** Default value for -checkmempool and -checkblockindex argument */
     bool DefaultConsistencyChecks() const { return fDefaultConsistencyChecks; }
@@ -137,7 +137,7 @@ class CChainParams
     /** Height or Time Based Activations **/
     int ModifierUpgradeBlock() const { return nModifierUpdateBlock; }
     int WALLET_UPGRADE_BLOCK() const { return nMandatoryUpgradeBlock; }
-    int WALLET_UPGRADE_VERSION() const { return nUpgradeBlockVersion; }
+    uint32_t WALLET_UPGRADE_VERSION() const { return nUpgradeBlockVersion; }
     uint32_t BadScryptDiffTimeStart() const { return nBadScryptDiffTimeStart; }
     uint32_t BadScryptDiffTimeEnd() const { return nBadScryptDiffTimeEnd; }
     int Zerocoin_StartHeight() const { return nZerocoinStartHeight; }
@@ -175,7 +175,7 @@ class CChainParams
     int64_t nTargetTimespan;
     int64_t nTargetSpacing;
     int nMandatoryUpgradeBlock;
-    int nUpgradeBlockVersion;
+    uint32_t nUpgradeBlockVersion;
     uint32_t nBadScryptDiffTimeStart;
     uint32_t nBadScryptDiffTimeEnd;
     int nMasternodeCountDrift;
diff --git a/src/crypter.cpp b/src/crypter.cpp
index 0c3253e..fd2e8d7 100644
--- a/src/crypter.cpp
+++ b/src/crypter.cpp
@@ -29,7 +29,7 @@ bool CCrypter::SetKeyFromPassphrase(const SecureString& strKeyData, const std::v
 
     if (nDerivationMethod == 1) {
         // Passphrase conversion
-        uint256 scryptHash = scrypt_salted_multiround_hash((const void*)strKeyData.c_str(), strKeyData.size(), &chSalt[0], 8, nRounds);
+        uint256 scryptHash = scrypt_salted_multiround_hash((const void*)strKeyData.c_str(), strKeyData.size(), &chSalt[0], WALLET_CRYPTO_SALT_SIZE, nRounds);
 
         i = EVP_BytesToKey(EVP_aes_256_cbc(), EVP_sha512(), &chSalt[0], (unsigned char *)&scryptHash, sizeof scryptHash, nRounds, chKey, chIV);
         memory_cleanse(&scryptHash, sizeof scryptHash);
diff --git a/src/crypto/scrypt-arm.S b/src/crypto/scrypt-arm.S
index 7e38639..528181f 100644
--- a/src/crypto/scrypt-arm.S
+++ b/src/crypto/scrypt-arm.S
@@ -11,7 +11,7 @@
 #include "config/simplicity-config.h"
 #endif
 
-#if defined(__arm__) && defined(__APCS_32__)
+#if /*defined(USE_ASM) &&*/ defined(__arm__) && defined(__APCS_32__)
 
 #if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
     defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \
diff --git a/src/crypto/scrypt-armv8.c b/src/crypto/scrypt-armv8.c
new file mode 100644
index 0000000..ec4e045
--- /dev/null
+++ b/src/crypto/scrypt-armv8.c
@@ -0,0 +1,1213 @@
+/*
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#if defined(__aarch64__)
+
+#include <arm_neon.h>
+#include <endian.h>
+
+static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
+{
+    uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
+    int i;
+    /* In place: B ^= Bx, then mix a working copy (x00..x15) of the result and add it back onto B. Bx is only read. */
+    x00 = (B[ 0] ^= Bx[ 0]);
+    x01 = (B[ 1] ^= Bx[ 1]);
+    x02 = (B[ 2] ^= Bx[ 2]);
+    x03 = (B[ 3] ^= Bx[ 3]);
+    x04 = (B[ 4] ^= Bx[ 4]);
+    x05 = (B[ 5] ^= Bx[ 5]);
+    x06 = (B[ 6] ^= Bx[ 6]);
+    x07 = (B[ 7] ^= Bx[ 7]);
+    x08 = (B[ 8] ^= Bx[ 8]);
+    x09 = (B[ 9] ^= Bx[ 9]);
+    x10 = (B[10] ^= Bx[10]);
+    x11 = (B[11] ^= Bx[11]);
+    x12 = (B[12] ^= Bx[12]);
+    x13 = (B[13] ^= Bx[13]);
+    x14 = (B[14] ^= Bx[14]);
+    x15 = (B[15] ^= Bx[15]);
+    for (i = 0; i < 8; i += 2) { /* i advances by 2: four passes, each a column step followed by a row step */
+#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+        /* Operate on columns. R(a, b) rotates the 32-bit value a left by b bits. */
+        x04 ^= R(x00+x12, 7);   x09 ^= R(x05+x01, 7);
+        x14 ^= R(x10+x06, 7);   x03 ^= R(x15+x11, 7);
+
+        x08 ^= R(x04+x00, 9);   x13 ^= R(x09+x05, 9);
+        x02 ^= R(x14+x10, 9);   x07 ^= R(x03+x15, 9);
+
+        x12 ^= R(x08+x04,13);   x01 ^= R(x13+x09,13);
+        x06 ^= R(x02+x14,13);   x11 ^= R(x07+x03,13);
+
+        x00 ^= R(x12+x08,18);   x05 ^= R(x01+x13,18);
+        x10 ^= R(x06+x02,18);   x15 ^= R(x11+x07,18);
+
+        /* Operate on rows. */
+        x01 ^= R(x00+x03, 7);   x06 ^= R(x05+x04, 7);
+        x11 ^= R(x10+x09, 7);   x12 ^= R(x15+x14, 7);
+
+        x02 ^= R(x01+x00, 9);   x07 ^= R(x06+x05, 9);
+        x08 ^= R(x11+x10, 9);   x13 ^= R(x12+x15, 9);
+
+        x03 ^= R(x02+x01,13);   x04 ^= R(x07+x06,13);
+        x09 ^= R(x08+x11,13);   x14 ^= R(x13+x12,13);
+
+        x00 ^= R(x03+x02,18);   x05 ^= R(x04+x07,18);
+        x10 ^= R(x09+x08,18);   x15 ^= R(x14+x13,18);
+#undef R
+    }
+    B[ 0] += x00; /* feed-forward: B already holds (B ^ Bx); add the mixed working words onto it */
+    B[ 1] += x01;
+    B[ 2] += x02;
+    B[ 3] += x03;
+    B[ 4] += x04;
+    B[ 5] += x05;
+    B[ 6] += x06;
+    B[ 7] += x07;
+    B[ 8] += x08;
+    B[ 9] += x09;
+    B[10] += x10;
+    B[11] += x11;
+    B[12] += x12;
+    B[13] += x13;
+    B[14] += x14;
+    B[15] += x15;
+}
+
+static inline void xor_salsa8_prefetch(uint32_t B[16], const uint32_t Bx[16], uint32_t* V, uint32_t N)
+{
+    uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
+    int i;
+    /* Same mixing as xor_salsa8(); additionally, once B[0] is final, prefetches the 32-word entry of V selected by B[0] & (N - 1). */
+    x00 = (B[ 0] ^= Bx[ 0]);
+    x01 = (B[ 1] ^= Bx[ 1]);
+    x02 = (B[ 2] ^= Bx[ 2]);
+    x03 = (B[ 3] ^= Bx[ 3]);
+    x04 = (B[ 4] ^= Bx[ 4]);
+    x05 = (B[ 5] ^= Bx[ 5]);
+    x06 = (B[ 6] ^= Bx[ 6]);
+    x07 = (B[ 7] ^= Bx[ 7]);
+    x08 = (B[ 8] ^= Bx[ 8]);
+    x09 = (B[ 9] ^= Bx[ 9]);
+    x10 = (B[10] ^= Bx[10]);
+    x11 = (B[11] ^= Bx[11]);
+    x12 = (B[12] ^= Bx[12]);
+    x13 = (B[13] ^= Bx[13]);
+    x14 = (B[14] ^= Bx[14]);
+    x15 = (B[15] ^= Bx[15]);
+    for (i = 0; i < 8; i += 2) { /* i advances by 2: four passes, each a column step followed by a row step */
+#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+        /* Operate on columns. R(a, b) rotates the 32-bit value a left by b bits. */
+        x04 ^= R(x00+x12, 7);   x09 ^= R(x05+x01, 7);
+        x14 ^= R(x10+x06, 7);   x03 ^= R(x15+x11, 7);
+
+        x08 ^= R(x04+x00, 9);   x13 ^= R(x09+x05, 9);
+        x02 ^= R(x14+x10, 9);   x07 ^= R(x03+x15, 9);
+
+        x12 ^= R(x08+x04,13);   x01 ^= R(x13+x09,13);
+        x06 ^= R(x02+x14,13);   x11 ^= R(x07+x03,13);
+
+        x00 ^= R(x12+x08,18);   x05 ^= R(x01+x13,18);
+        x10 ^= R(x06+x02,18);   x15 ^= R(x11+x07,18);
+
+        /* Operate on rows. */
+        x01 ^= R(x00+x03, 7);   x06 ^= R(x05+x04, 7);
+        x11 ^= R(x10+x09, 7);   x12 ^= R(x15+x14, 7);
+
+        x02 ^= R(x01+x00, 9);   x07 ^= R(x06+x05, 9);
+        x08 ^= R(x11+x10, 9);   x13 ^= R(x12+x15, 9);
+
+        x03 ^= R(x02+x01,13);   x04 ^= R(x07+x06,13);
+        x09 ^= R(x08+x11,13);   x14 ^= R(x13+x12,13);
+
+        x00 ^= R(x03+x02,18);   x05 ^= R(x04+x07,18);
+        x10 ^= R(x09+x08,18);   x15 ^= R(x14+x13,18);
+#undef R
+    }
+    B[ 0] += x00; /* B[0] is finalized first because the prefetch address below depends only on it */
+    uint32_t one = 32 * (B[0] & (N - 1)); /* word offset of the selected V entry; the mask equals "mod N" only if N is a power of two -- presumably guaranteed by callers */
+    __builtin_prefetch(&V[one + 0]); /* four hints, 8 words (32 bytes) apart, spanning the entry's 32 words */
+    __builtin_prefetch(&V[one + 8]);
+    __builtin_prefetch(&V[one + 16]);
+    __builtin_prefetch(&V[one + 24]);
+    asm("":::"memory"); /* empty asm with a "memory" clobber: compiler-level barrier, presumably to keep the prefetches ahead of the remaining stores */
+    B[ 1] += x01;
+    B[ 2] += x02;
+    B[ 3] += x03;
+    B[ 4] += x04;
+    B[ 5] += x05;
+    B[ 6] += x06;
+    B[ 7] += x07;
+    B[ 8] += x08;
+    B[ 9] += x09;
+    B[10] += x10;
+    B[11] += x11;
+    B[12] += x12;
+    B[13] += x13;
+    B[14] += x14;
+    B[15] += x15;
+}
+
+void scrypt_core(uint32_t *X, uint32_t *V, int N)
+{
+    int i;
+    /* X: 32-word state, updated in place. V: scratch holding N entries of 32 words. N is used as a mask (N - 1), so presumably a power of two. */
+    for (i = 0; i < N; i++) { /* fill phase: store the current X as entry i of V, then mix both 16-word halves of X */
+        memcpy(&V[i * 32], X, 128);
+        xor_salsa8(&X[0], &X[16]);
+        xor_salsa8(&X[16], &X[0]);
+    }
+    for (i = 0; i < N; i++) { /* read phase: N data-dependent lookups into V */
+        uint32_t j = 32 * (X[16] & (N - 1)); /* word offset of the entry chosen by the first word of X's upper half */
+        for (uint8_t k = 0; k < 32; k++)
+            X[k] ^= V[j + k];
+        xor_salsa8(&X[0], &X[16]);
+        xor_salsa8_prefetch(&X[16], &X[0], V, N); /* updates X[16..31] and prefetches the entry the next iteration will index via X[16] */
+    }
+}
+
+static inline void scrypt_shuffle(uint32_t B[16])
+{   /* Reorders the 16 words of B in place; all words are read into locals first so no store can clobber a pending load. */
+    uint32_t x0 =   B[0];
+    uint32_t x1 =   B[1];
+    uint32_t x2 =   B[2];
+    uint32_t x3 =   B[3];
+    uint32_t x4 =   B[4];
+    uint32_t x5 =   B[5];
+    uint32_t x6 =   B[6];
+    uint32_t x7 =   B[7];
+    uint32_t x8 =   B[8];
+    uint32_t x9 =   B[9];
+    uint32_t x10 = B[10];
+    uint32_t x11 = B[11];
+    uint32_t x12 = B[12];
+    uint32_t x13 = B[13];
+    uint32_t x14 = B[14];
+    uint32_t x15 = B[15];
+    /* New order, four words per line: (0,5,10,15) (12,1,6,11) (8,13,2,7) (4,9,14,3). Presumably the layout the vector code in scrypt_core_3way expects. */
+    B[0] = x0;  B[1] = x5;  B[2] = x10;  B[3] = x15;
+    B[4] = x12; B[5] = x1;  B[6] = x6;   B[7] = x11;
+    B[8] = x8;  B[9] = x13; B[10] = x2;  B[11] = x7;
+    B[12] = x4; B[13] = x9; B[14] = x14; B[15] = x3;
+}
+
+void scrypt_core_3way(uint32_t *X, uint32_t *V, int N)
+{
+    uint32_t* W = V;
+
+    scrypt_shuffle(&X[0  + 0]);
+    scrypt_shuffle(&X[16 + 0]);
+    scrypt_shuffle(&X[0 + 32]);
+    scrypt_shuffle(&X[16 + 32]);
+    scrypt_shuffle(&X[0 + 64]);
+    scrypt_shuffle(&X[16 + 64]);
+
+    uint32x4x4_t q_a, q_b, q_c, q_tmp;
+    uint32x4x4_t ba_a, bb_a, bc_a, ba_b, bb_b, bc_b;
+
+    ba_a.val[0] = vld1q_u32(&X[( 0) / 4]);
+    ba_a.val[1] = vld1q_u32(&X[(16) / 4]);
+    ba_a.val[2] = vld1q_u32(&X[(32) / 4]);
+    ba_a.val[3] = vld1q_u32(&X[(48) / 4]);
+    ba_b.val[0] = vld1q_u32(&X[(0 + 64 + 0) / 4]);
+    ba_b.val[1] = vld1q_u32(&X[(0 + 64 + 16) / 4]);
+    ba_b.val[2] = vld1q_u32(&X[(0 + 64 + 32) / 4]);
+    ba_b.val[3] = vld1q_u32(&X[(0 + 64 + 48) / 4]);
+
+    bb_a.val[0] = vld1q_u32(&X[(128 +  0) / 4]);
+    bb_a.val[1] = vld1q_u32(&X[(128 + 16) / 4]);
+    bb_a.val[2] = vld1q_u32(&X[(128 + 32) / 4]);
+    bb_a.val[3] = vld1q_u32(&X[(128 + 48) / 4]);
+    bb_b.val[0] = vld1q_u32(&X[(128 + 64 + 0) / 4]);
+    bb_b.val[1] = vld1q_u32(&X[(128 + 64 + 16) / 4]);
+    bb_b.val[2] = vld1q_u32(&X[(128 + 64 + 32) / 4]);
+    bb_b.val[3] = vld1q_u32(&X[(128 + 64 + 48) / 4]);
+
+    bc_a.val[0] = vld1q_u32(&X[(256 + 0) / 4]);
+    bc_a.val[1] = vld1q_u32(&X[(256 + 16) / 4]);
+    bc_a.val[2] = vld1q_u32(&X[(256 + 32) / 4]);
+    bc_a.val[3] = vld1q_u32(&X[(256 + 48) / 4]);
+    bc_b.val[0] = vld1q_u32(&X[(256 + 64 + 0) / 4]);
+    bc_b.val[1] = vld1q_u32(&X[(256 + 64 + 16) / 4]);
+    bc_b.val[2] = vld1q_u32(&X[(256 + 64 + 32) / 4]);
+    bc_b.val[3] = vld1q_u32(&X[(256 + 64 + 48) / 4]);
+
+    // prep
+
+    vst1q_u32(&V[( 0) / 4], ba_a.val[0]);
+    vst1q_u32(&V[(16) / 4], ba_a.val[1]);
+    vst1q_u32(&V[(32) / 4], ba_a.val[2]);
+    vst1q_u32(&V[(48) / 4], ba_a.val[3]);
+    vst1q_u32(&V[(64) / 4],  ba_b.val[0]);
+    vst1q_u32(&V[(80) / 4],  ba_b.val[1]);
+    vst1q_u32(&V[(96) / 4],  ba_b.val[2]);
+    vst1q_u32(&V[(112) / 4], ba_b.val[3]);
+
+    vst1q_u32(&V[(128 +  0) / 4], bb_a.val[0]);
+    vst1q_u32(&V[(128 + 16) / 4], bb_a.val[1]);
+    vst1q_u32(&V[(128 + 32) / 4], bb_a.val[2]);
+    vst1q_u32(&V[(128 + 48) / 4], bb_a.val[3]);
+    vst1q_u32(&V[(128 + 64) / 4],  bb_b.val[0]);
+    vst1q_u32(&V[(128 + 80) / 4],  bb_b.val[1]);
+    vst1q_u32(&V[(128 + 96) / 4],  bb_b.val[2]);
+    vst1q_u32(&V[(128 + 112) / 4], bb_b.val[3]);
+
+    vst1q_u32(&V[(256 +  0) / 4], bc_a.val[0]);
+    vst1q_u32(&V[(256 + 16) / 4], bc_a.val[1]);
+    vst1q_u32(&V[(256 + 32) / 4], bc_a.val[2]);
+    vst1q_u32(&V[(256 + 48) / 4], bc_a.val[3]);
+    vst1q_u32(&V[(256 + 64) / 4], bc_b.val[0]);
+    vst1q_u32(&V[(256 + 80) / 4], bc_b.val[1]);
+    vst1q_u32(&V[(256 + 96) / 4], bc_b.val[2]);
+    vst1q_u32(&V[(256 + 112) / 4],bc_b.val[3]);
+
+    V += 96;
+
+    for (int n = 0; n < N; n++)
+    {
+        // loop 1 part a
+        q_a.val[0] = veorq_u32(ba_b.val[0], ba_a.val[0]);
+        q_a.val[1] = veorq_u32(ba_b.val[1], ba_a.val[1]);
+        q_a.val[2] = veorq_u32(ba_b.val[2], ba_a.val[2]);
+        q_a.val[3] = veorq_u32(ba_b.val[3], ba_a.val[3]);
+
+        q_b.val[0] = veorq_u32(bb_b.val[0], bb_a.val[0]);
+        q_b.val[1] = veorq_u32(bb_b.val[1], bb_a.val[1]);
+        q_b.val[2] = veorq_u32(bb_b.val[2], bb_a.val[2]);
+        q_b.val[3] = veorq_u32(bb_b.val[3], bb_a.val[3]);
+
+        q_c.val[0] = veorq_u32(bc_b.val[0], bc_a.val[0]);
+        q_c.val[1] = veorq_u32(bc_b.val[1], bc_a.val[1]);
+        q_c.val[2] = veorq_u32(bc_b.val[2], bc_a.val[2]);
+        q_c.val[3] = veorq_u32(bc_b.val[3], bc_a.val[3]);
+
+        ba_a = q_a;
+        bb_a = q_b;
+        bc_a = q_c;
+
+        for (int i = 0; i < 4; i ++)
+        {
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 3);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 3);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 1);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 1);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 1);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 3);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 3);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 1);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 1);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 1);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+        }
+        ba_a.val[0] = vaddq_u32(ba_a.val[0], q_a.val[0]);
+        ba_a.val[1] = vaddq_u32(ba_a.val[1], q_a.val[1]);
+        ba_a.val[2] = vaddq_u32(ba_a.val[2], q_a.val[2]);
+        ba_a.val[3] = vaddq_u32(ba_a.val[3], q_a.val[3]);
+
+        q_a = ba_a;
+
+        bb_a.val[0] = vaddq_u32(bb_a.val[0], q_b.val[0]);
+        bb_a.val[1] = vaddq_u32(bb_a.val[1], q_b.val[1]);
+        bb_a.val[2] = vaddq_u32(bb_a.val[2], q_b.val[2]);
+        bb_a.val[3] = vaddq_u32(bb_a.val[3], q_b.val[3]);
+
+        q_b = bb_a;
+
+        bc_a.val[0] = vaddq_u32(bc_a.val[0], q_c.val[0]);
+        bc_a.val[1] = vaddq_u32(bc_a.val[1], q_c.val[1]);
+        bc_a.val[2] = vaddq_u32(bc_a.val[2], q_c.val[2]);
+        bc_a.val[3] = vaddq_u32(bc_a.val[3], q_c.val[3]);
+
+        q_c = bc_a;
+
+        for (int i = 0; i < 4; i++)
+        {
+            vst1q_u32(&V[      (i * 4) ], ba_a.val[i]);
+            vst1q_u32(&V[(32 + (i * 4))], bb_a.val[i]);
+            vst1q_u32(&V[(64 + (i * 4))], bc_a.val[i]);
+        }
+
+        // loop 1 part b
+
+        q_a.val[0] = veorq_u32(ba_b.val[0], q_a.val[0]);
+        q_a.val[1] = veorq_u32(ba_b.val[1], q_a.val[1]);
+        q_a.val[2] = veorq_u32(ba_b.val[2], q_a.val[2]);
+        q_a.val[3] = veorq_u32(ba_b.val[3], q_a.val[3]);
+        ba_b = q_a;
+
+        q_b.val[0] = veorq_u32(bb_b.val[0], q_b.val[0]);
+        q_b.val[1] = veorq_u32(bb_b.val[1], q_b.val[1]);
+        q_b.val[2] = veorq_u32(bb_b.val[2], q_b.val[2]);
+        q_b.val[3] = veorq_u32(bb_b.val[3], q_b.val[3]);
+        bb_b = q_b;
+
+        q_c.val[0] = veorq_u32(bc_b.val[0], q_c.val[0]);
+        q_c.val[1] = veorq_u32(bc_b.val[1], q_c.val[1]);
+        q_c.val[2] = veorq_u32(bc_b.val[2], q_c.val[2]);
+        q_c.val[3] = veorq_u32(bc_b.val[3], q_c.val[3]);
+        bc_b = q_c;
+
+
+        for (int i = 0; i < 4; i ++)
+        {
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 3);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 3);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 1);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 1);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 1);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 3);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 3);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 1);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 1);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 1);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+        }
+
+        ba_b.val[0] = vaddq_u32(q_a.val[0], ba_b.val[0]);
+        ba_b.val[1] = vaddq_u32(q_a.val[1], ba_b.val[1]);
+        ba_b.val[2] = vaddq_u32(q_a.val[2], ba_b.val[2]);
+        ba_b.val[3] = vaddq_u32(q_a.val[3], ba_b.val[3]);
+        bb_b.val[0] = vaddq_u32(q_b.val[0], bb_b.val[0]);
+        bb_b.val[1] = vaddq_u32(q_b.val[1], bb_b.val[1]);
+        bb_b.val[2] = vaddq_u32(q_b.val[2], bb_b.val[2]);
+        bb_b.val[3] = vaddq_u32(q_b.val[3], bb_b.val[3]);
+        bc_b.val[0] = vaddq_u32(q_c.val[0], bc_b.val[0]);
+        bc_b.val[1] = vaddq_u32(q_c.val[1], bc_b.val[1]);
+        bc_b.val[2] = vaddq_u32(q_c.val[2], bc_b.val[2]);
+        bc_b.val[3] = vaddq_u32(q_c.val[3], bc_b.val[3]);
+        for (int i = 0; i < 4; i++)
+        {
+            vst1q_u32(&V[(     16 + (i * 4))], ba_b.val[i]);
+            vst1q_u32(&V[(32 + 16 + (i * 4))], bb_b.val[i]);
+            vst1q_u32(&V[(64 + 16 + (i * 4))], bc_b.val[i]);
+        }
+        V += 96;
+    }
+    V = W;
+
+    // loop 2
+
+    uint32x4x4_t x;
+
+    uint32_t one =   32 * (3 * (ba_b.val[0][0] & (N - 1)) + 0);
+    uint32_t two =   32 * (3 * (bb_b.val[0][0] & (N - 1)) + 1);
+    uint32_t three = 32 * (3 * (bc_b.val[0][0] & (N - 1)) + 2);
+    x.val[0] = vld1q_u32(&W[one +  0]);
+    x.val[1] = vld1q_u32(&W[one +  4]);
+    x.val[2] = vld1q_u32(&W[one +  8]);
+    x.val[3] = vld1q_u32(&W[one + 12]);
+
+    for (int n = 0; n < N; n++)
+    {
+        // loop 2 part a
+
+        ba_a.val[0] = veorq_u32(ba_a.val[0], x.val[0]);
+            x.val[0] = vld1q_u32(&W[one + 16 +  0]);
+        ba_a.val[1] = veorq_u32(ba_a.val[1], x.val[1]);
+            x.val[1] = vld1q_u32(&W[one + 16 +  4]);
+        ba_a.val[2] = veorq_u32(ba_a.val[2], x.val[2]);
+            x.val[2] = vld1q_u32(&W[one + 16 +  8]);
+        ba_a.val[3] = veorq_u32(ba_a.val[3], x.val[3]);
+
+            ba_b.val[0] = veorq_u32(ba_b.val[0], x.val[0]);
+            ba_b.val[1] = veorq_u32(ba_b.val[1], x.val[1]);
+            x.val[3] = vld1q_u32(&W[one + 16 + 12]);
+            ba_b.val[2] = veorq_u32(ba_b.val[2], x.val[2]);
+            ba_b.val[3] = veorq_u32(ba_b.val[3], x.val[3]);
+        x.val[0] = vld1q_u32(&W[two +  0]);
+                q_a.val[0] = veorq_u32(ba_b.val[0], ba_a.val[0]);
+                q_a.val[1] = veorq_u32(ba_b.val[1], ba_a.val[1]);
+        x.val[1] = vld1q_u32(&W[two +  4]);
+                q_a.val[2] = veorq_u32(ba_b.val[2], ba_a.val[2]);
+                q_a.val[3] = veorq_u32(ba_b.val[3], ba_a.val[3]);
+        x.val[2] = vld1q_u32(&W[two +  8]);
+        ba_a = q_a;
+
+        x.val[3] = vld1q_u32(&W[two + 12]);
+
+        bb_a.val[0] = veorq_u32(bb_a.val[0], x.val[0]);
+            x.val[0] = vld1q_u32(&W[two + 16 +  0]);
+        bb_a.val[1] = veorq_u32(bb_a.val[1], x.val[1]);
+            x.val[1] = vld1q_u32(&W[two + 16 +  4]);
+        bb_a.val[2] = veorq_u32(bb_a.val[2], x.val[2]);
+            x.val[2] = vld1q_u32(&W[two + 16 +  8]);
+        bb_a.val[3] = veorq_u32(bb_a.val[3], x.val[3]);
+            bb_b.val[0] = veorq_u32(bb_b.val[0], x.val[0]);
+            x.val[3] = vld1q_u32(&W[two + 16 + 12]);
+            bb_b.val[1] = veorq_u32(bb_b.val[1], x.val[1]);
+        x.val[0] = vld1q_u32(&W[three +  0]);
+            bb_b.val[2] = veorq_u32(bb_b.val[2], x.val[2]);
+            bb_b.val[3] = veorq_u32(bb_b.val[3], x.val[3]);
+        x.val[1] = vld1q_u32(&W[three +  4]);
+                q_b.val[0] = veorq_u32(bb_b.val[0], bb_a.val[0]);
+                q_b.val[1] = veorq_u32(bb_b.val[1], bb_a.val[1]);
+        x.val[2] = vld1q_u32(&W[three +  8]);
+                q_b.val[2] = veorq_u32(bb_b.val[2], bb_a.val[2]);
+                q_b.val[3] = veorq_u32(bb_b.val[3], bb_a.val[3]);
+        x.val[3] = vld1q_u32(&W[three + 12]);
+        bb_a = q_b;
+
+        bc_a.val[0] = veorq_u32(bc_a.val[0], x.val[0]);
+            x.val[0] = vld1q_u32(&W[three + 16 +  0]);
+        bc_a.val[1] = veorq_u32(bc_a.val[1], x.val[1]);
+            x.val[1] = vld1q_u32(&W[three + 16 +  4]);
+        bc_a.val[2] = veorq_u32(bc_a.val[2], x.val[2]);
+            x.val[2] = vld1q_u32(&W[three + 16 +  8]);
+        bc_a.val[3] = veorq_u32(bc_a.val[3], x.val[3]);
+            bc_b.val[0] = veorq_u32(bc_b.val[0], x.val[0]);
+            x.val[3] = vld1q_u32(&W[three + 16 + 12]);
+            bc_b.val[1] = veorq_u32(bc_b.val[1], x.val[1]);
+            bc_b.val[2] = veorq_u32(bc_b.val[2], x.val[2]);
+            bc_b.val[3] = veorq_u32(bc_b.val[3], x.val[3]);
+                q_c.val[0] = veorq_u32(bc_b.val[0], bc_a.val[0]);
+                q_c.val[1] = veorq_u32(bc_b.val[1], bc_a.val[1]);
+                q_c.val[2] = veorq_u32(bc_b.val[2], bc_a.val[2]);
+                q_c.val[3] = veorq_u32(bc_b.val[3], bc_a.val[3]);
+        bc_a = q_c;
+
+        for (int i = 0; i < 4; i++)
+        {
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 3);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 3);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 1);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 1);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 1);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 3);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 3);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 1);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 1);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 1);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+        }
+        ba_a.val[0] = vaddq_u32(ba_a.val[0], q_a.val[0]);
+        ba_a.val[1] = vaddq_u32(ba_a.val[1], q_a.val[1]);
+        ba_a.val[2] = vaddq_u32(ba_a.val[2], q_a.val[2]);
+        ba_a.val[3] = vaddq_u32(ba_a.val[3], q_a.val[3]);
+
+        q_a = ba_a;
+
+        bb_a.val[0] = vaddq_u32(bb_a.val[0], q_b.val[0]);
+        bb_a.val[1] = vaddq_u32(bb_a.val[1], q_b.val[1]);
+        bb_a.val[2] = vaddq_u32(bb_a.val[2], q_b.val[2]);
+        bb_a.val[3] = vaddq_u32(bb_a.val[3], q_b.val[3]);
+        q_b = bb_a;
+
+        bc_a.val[0] = vaddq_u32(bc_a.val[0], q_c.val[0]);
+        bc_a.val[1] = vaddq_u32(bc_a.val[1], q_c.val[1]);
+        bc_a.val[2] = vaddq_u32(bc_a.val[2], q_c.val[2]);
+        bc_a.val[3] = vaddq_u32(bc_a.val[3], q_c.val[3]);
+        q_c = bc_a;
+
+        // loop 2 b
+
+        q_a.val[0] = veorq_u32(ba_b.val[0], q_a.val[0]);
+        q_a.val[1] = veorq_u32(ba_b.val[1], q_a.val[1]);
+        q_a.val[2] = veorq_u32(ba_b.val[2], q_a.val[2]);
+        q_a.val[3] = veorq_u32(ba_b.val[3], q_a.val[3]);
+        ba_b = q_a;
+
+        q_b.val[0] = veorq_u32(bb_b.val[0], q_b.val[0]);
+        q_b.val[1] = veorq_u32(bb_b.val[1], q_b.val[1]);
+        q_b.val[2] = veorq_u32(bb_b.val[2], q_b.val[2]);
+        q_b.val[3] = veorq_u32(bb_b.val[3], q_b.val[3]);
+        bb_b = q_b;
+
+        q_c.val[0] = veorq_u32(bc_b.val[0], q_c.val[0]);
+        q_c.val[1] = veorq_u32(bc_b.val[1], q_c.val[1]);
+        q_c.val[2] = veorq_u32(bc_b.val[2], q_c.val[2]);
+        q_c.val[3] = veorq_u32(bc_b.val[3], q_c.val[3]);
+        bc_b = q_c;
+
+
+        for (int i = 0; i < 3; i++)
+        {
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 3);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 3);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 1);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 1);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 1);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 3);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 3);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 3);
+
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 1);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 1);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 1);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+        }
+        {
+            //1
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+            //2
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+            //3
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 3);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 3);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 3);
+            //4
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+            //5
+            q_tmp.val[0] = vaddq_u32(q_a.val[0], q_a.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 7);
+            q_tmp.val[2] = vaddq_u32(q_b.val[0], q_b.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 25);
+            q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 1);
+            q_a.val[1] = veorq_u32(q_tmp.val[1], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 7);
+            q_tmp.val[3] = vaddq_u32(q_c.val[0], q_c.val[3]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 25);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 1);
+            q_b.val[1] = veorq_u32(q_tmp.val[1], q_b.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 7);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 25);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 1);
+            q_c.val[1] = veorq_u32(q_tmp.val[1], q_c.val[1]);
+            //6
+            q_tmp.val[0] = vaddq_u32(q_a.val[1], q_a.val[0]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 9);
+            q_tmp.val[2] = vaddq_u32(q_b.val[1], q_b.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 23);
+            q_a.val[2] = veorq_u32(q_tmp.val[1], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 9);
+            q_tmp.val[3] = vaddq_u32(q_c.val[1], q_c.val[0]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 23);
+            q_b.val[2] = veorq_u32(q_tmp.val[1], q_b.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 9);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 23);
+            q_c.val[2] = veorq_u32(q_tmp.val[1], q_c.val[2]);
+            //7
+            q_tmp.val[0] = vaddq_u32(q_a.val[2], q_a.val[1]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 13);
+            q_tmp.val[2] = vaddq_u32(q_b.val[2], q_b.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 19);
+            q_a.val[3] = veorq_u32(q_tmp.val[1], q_a.val[3]);
+                q_a.val[1] = vextq_u32(q_a.val[1], q_a.val[1], 3);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 13);
+            q_tmp.val[3] = vaddq_u32(q_c.val[2], q_c.val[1]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 19);
+            q_b.val[3] = veorq_u32(q_tmp.val[1], q_b.val[3]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 13);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 19);
+            q_c.val[3] = veorq_u32(q_tmp.val[1], q_c.val[3]);
+            q_b.val[1] = vextq_u32(q_b.val[1], q_b.val[1], 3);
+            q_c.val[1] = vextq_u32(q_c.val[1], q_c.val[1], 3);
+
+            //8
+            q_tmp.val[0] = vaddq_u32(q_a.val[3], q_a.val[2]);
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[0], 18);
+            q_tmp.val[2] = vaddq_u32(q_b.val[3], q_b.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[0], 14);
+            q_a.val[0] = veorq_u32(q_tmp.val[1], q_a.val[0]);
+                ba_b.val[0] = vaddq_u32(q_a.val[0], ba_b.val[0]);
+                    one =   32 * (3 * (ba_b.val[0][0] & (N - 1)) + 0);
+                    __builtin_prefetch(&W[one + 0]);
+                    __builtin_prefetch(&W[one + 8]);
+                    __builtin_prefetch(&W[one + 16]);
+                    __builtin_prefetch(&W[one + 24]);
+
+            q_a.val[2] = vextq_u32(q_a.val[2], q_a.val[2], 2);
+            q_b.val[2] = vextq_u32(q_b.val[2], q_b.val[2], 2);
+
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[2], 18);
+            q_tmp.val[3] = vaddq_u32(q_c.val[3], q_c.val[2]);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[2], 14);
+            q_c.val[2] = vextq_u32(q_c.val[2], q_c.val[2], 2);
+            q_b.val[3] = vextq_u32(q_b.val[3], q_b.val[3], 1);
+            q_b.val[0] = veorq_u32(q_tmp.val[1], q_b.val[0]);
+                bb_b.val[0] = vaddq_u32(q_b.val[0], bb_b.val[0]);
+                    two =   32 * (3 * (bb_b.val[0][0] & (N - 1)) + 1);
+                    __builtin_prefetch(&W[two + 0]);
+                    __builtin_prefetch(&W[two + 8]);
+                    __builtin_prefetch(&W[two + 16]);
+                    __builtin_prefetch(&W[two + 24]);
+
+            q_tmp.val[1] = vshlq_n_u32(q_tmp.val[3], 18);
+            q_tmp.val[1] = vsriq_n_u32(q_tmp.val[1], q_tmp.val[3], 14);
+            q_a.val[3] = vextq_u32(q_a.val[3], q_a.val[3], 1);
+            q_c.val[3] = vextq_u32(q_c.val[3], q_c.val[3], 1);
+            q_c.val[0] = veorq_u32(q_tmp.val[1], q_c.val[0]);
+                bc_b.val[0] = vaddq_u32(q_c.val[0], bc_b.val[0]);
+                    three = 32 * (3 * (bc_b.val[0][0] & (N - 1)) + 2);
+                    __builtin_prefetch(&W[three + 0]);
+                    __builtin_prefetch(&W[three + 8]);
+                    __builtin_prefetch(&W[three + 16]);
+                    __builtin_prefetch(&W[three + 24]);
+        }
+
+        x.val[0] = vld1q_u32(&W[one +  0]);
+        ba_b.val[1] = vaddq_u32(q_a.val[1], ba_b.val[1]);
+        ba_b.val[2] = vaddq_u32(q_a.val[2], ba_b.val[2]);
+        ba_b.val[3] = vaddq_u32(q_a.val[3], ba_b.val[3]);
+        x.val[1] = vld1q_u32(&W[one +  4]);
+        bb_b.val[1] = vaddq_u32(q_b.val[1], bb_b.val[1]);
+        bb_b.val[2] = vaddq_u32(q_b.val[2], bb_b.val[2]);
+        bb_b.val[3] = vaddq_u32(q_b.val[3], bb_b.val[3]);
+        x.val[2] = vld1q_u32(&W[one +  8]);
+        bc_b.val[1] = vaddq_u32(q_c.val[1], bc_b.val[1]);
+        bc_b.val[2] = vaddq_u32(q_c.val[2], bc_b.val[2]);
+        bc_b.val[3] = vaddq_u32(q_c.val[3], bc_b.val[3]);
+        x.val[3] = vld1q_u32(&W[one + 12]);
+    }
+
+    vst1q_u32(&X[0],       ba_a.val[0]);
+    vst1q_u32(&X[4],       ba_a.val[1]);
+    vst1q_u32(&X[8],       ba_a.val[2]);
+    vst1q_u32(&X[12],      ba_a.val[3]);
+    vst1q_u32(&X[16 + 0],  ba_b.val[0]);
+    vst1q_u32(&X[16 + 4],  ba_b.val[1]);
+    vst1q_u32(&X[16 + 8],  ba_b.val[2]);
+    vst1q_u32(&X[16 + 12], ba_b.val[3]);
+
+    vst1q_u32(&X[32 + 0],       bb_a.val[0]);
+    vst1q_u32(&X[32 + 4],       bb_a.val[1]);
+    vst1q_u32(&X[32 + 8],       bb_a.val[2]);
+    vst1q_u32(&X[32 + 12],      bb_a.val[3]);
+    vst1q_u32(&X[32 + 16 + 0],  bb_b.val[0]);
+    vst1q_u32(&X[32 + 16 + 4],  bb_b.val[1]);
+    vst1q_u32(&X[32 + 16 + 8],  bb_b.val[2]);
+    vst1q_u32(&X[32 + 16 + 12], bb_b.val[3]);
+
+    vst1q_u32(&X[64 + 0],       bc_a.val[0]);
+    vst1q_u32(&X[64 + 4],       bc_a.val[1]);
+    vst1q_u32(&X[64 + 8],       bc_a.val[2]);
+    vst1q_u32(&X[64 + 12],      bc_a.val[3]);
+    vst1q_u32(&X[64 + 16 + 0],  bc_b.val[0]);
+    vst1q_u32(&X[64 + 16 + 4],  bc_b.val[1]);
+    vst1q_u32(&X[64 + 16 + 8],  bc_b.val[2]);
+    vst1q_u32(&X[64 + 16 + 12], bc_b.val[3]);
+
+    scrypt_shuffle(&X[0  + 0]);
+    scrypt_shuffle(&X[16 + 0]);
+    scrypt_shuffle(&X[0 + 32]);
+    scrypt_shuffle(&X[16 + 32]);
+    scrypt_shuffle(&X[0 + 64]);
+    scrypt_shuffle(&X[16 + 64]);
+}
+#endif
diff --git a/src/crypto/scrypt-x64.S b/src/crypto/scrypt-x64.S
index 11afd15..31dd2e2 100644
--- a/src/crypto/scrypt-x64.S
+++ b/src/crypto/scrypt-x64.S
@@ -38,7 +38,7 @@
     .section .note.GNU-stack,"",%progbits
 #endif
 
-#if defined(__x86_64__)
+#if /*defined(USE_ASM) &&*/ defined(__x86_64__)
 
     .text
     .p2align 6
@@ -2224,9 +2224,9 @@ scrypt_core_xmm_loop2:
     ret
 
 
-#if defined(ENABLE_AVX2)
+#if defined(ENABLE_AVX)
 
-#endif /* USE_AVX */
+#endif /* ENABLE_AVX */
 
     .text
     .p2align 6
@@ -2258,7 +2258,7 @@ _scrypt_core_3way:
     subq    $392, %rsp
 
 
-#if !defined(ENABLE_AVX2)
+#if !defined(ENABLE_AVX)
     jmp scrypt_core_3way_xmm
 #else
     /* Check for AVX and OSXSAVE support */
@@ -2273,7 +2273,7 @@ _scrypt_core_3way:
     andl    $0x00000006, %eax
     cmpl    $0x00000006, %eax
     jne scrypt_core_3way_xmm
-#if defined(USE_XOP)
+#if defined(ENABLE_XOP)
     /* Check for XOP support */
     movl    $0x80000001, %eax
     cpuid
@@ -5311,7 +5311,7 @@ scrypt_core_3way_avx_loop2:
     popq    %rbx
     ret
 
-#if defined(USE_XOP)
+#if defined(ENABLE_XOP)
 
 
     .p2align 6
@@ -7575,8 +7575,8 @@ scrypt_core_3way_xop_loop2:
     popq    %rbp
     popq    %rbx
     ret
-#endif /* USE_XOP */
-#endif /* USE_AVX */
+#endif /* ENABLE_XOP */
+#endif /* ENABLE_AVX */
 
 
 
@@ -14401,6 +14401,6 @@ scrypt_core_6way_avx2_loop2:
     popq    %rbx
     ret
 
-#endif /* USE_AVX2 */
+#endif /* ENABLE_AVX2 */
 
 #endif
diff --git a/src/crypto/scrypt-x86.S b/src/crypto/scrypt-x86.S
index 9ad7519..4ab86ef 100644
--- a/src/crypto/scrypt-x86.S
+++ b/src/crypto/scrypt-x86.S
@@ -32,7 +32,7 @@
     .section .note.GNU-stack,"",%progbits
 #endif
 
-#if defined(__i386__)
+#if /*defined(USE_ASM) &&*/ defined(__i386__)
 
 
 
diff --git a/src/crypto/scrypt.cpp b/src/crypto/scrypt.cpp
index 42be66b..0f05068 100644
--- a/src/crypto/scrypt.cpp
+++ b/src/crypto/scrypt.cpp
@@ -28,6 +28,7 @@
  */
 
 #include "crypto/scrypt.h"
+#include "crypto/scrypt_opt.h"
 #include "uint256.h"
 #include "utilstrencodings.h"
 #include <openssl/sha.h>
@@ -363,22 +364,27 @@ void scrypt(const char* pass, unsigned int pLen, const char* salt, unsigned int
 {
     //containers
     void* V0 = malloc(128 * r * N + 63);
-    void* XY0 = malloc(256 * r + 64 + 63);
     void* B1 = malloc(128 * r * p + 63);
-    uint8_t* B = (uint8_t *)(((uintptr_t)(B1) + 63) & ~ (uintptr_t)(63));
-    uint32_t* V = (uint32_t *)(((uintptr_t)(V0) + 63) & ~ (uintptr_t)(63));
-    uint32_t* XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
+    uint8_t* B = (uint8_t *)(((uintptr_t)(B1) + 63) & ~(uintptr_t)(63));
+    uint32_t* V = (uint32_t *)(((uintptr_t)(V0) + 63) & ~(uintptr_t)(63));
 
     PBKDF2_SHA256((const uint8_t *)pass, pLen, (const uint8_t *)salt, sLen, 1, B, p * 128 * r);
 
-    for(unsigned int i = 0; i < p; i++)
-    {
-        SMix(&B[i * 128 * r], r, N, V, XY);
+    if (r == 1 && p == 1) {
+        scrypt_core((uint32_t*)B, V, N);
+    } else {
+        void* XY0 = malloc(256 * r + 64 + 63);
+        uint32_t* XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~(uintptr_t)(63));
+
+        for (unsigned int i = 0; i < p; i++) {
+            SMix(&B[i * 128 * r], r, N, V, XY);
+        }
+
+        free(XY0);
     }
 
     PBKDF2_SHA256((const uint8_t *)pass, pLen, B, p * 128 * r, 1, (uint8_t *)output, dkLen);
 
     free(V0);
-    free(XY0);
     free(B1);
 }
diff --git a/src/crypto/scrypt2.cpp b/src/crypto/scrypt_opt.cpp
similarity index 93%
rename from src/crypto/scrypt2.cpp
rename to src/crypto/scrypt_opt.cpp
index e37fca4..e08badd 100644
--- a/src/crypto/scrypt2.cpp
+++ b/src/crypto/scrypt_opt.cpp
@@ -27,11 +27,40 @@
  * online backup system.
  */
 
-#include "crypto/scrypt2.h"
+#include "crypto/scrypt_opt.h"
 #include "compat.h"
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
+#include <stdio.h>
+
+static bool HAVE_AVX2 = false;
+
+#if defined(__x86_64__)
+/* Startup probe: HAVE_AVX2 is set only when the CPU reports AVX2 and the OS preserves YMM state. */
+static inline void __attribute__((constructor)) check_avx2()
+{
+    unsigned int a, b, c, d, max_leaf, xcr0 = 0;
+    const unsigned int AVX_mask = (1u << 28) | (1u << 27) | (1u << 26); // AVX | OSXSAVE | XSAVE
+    asm volatile("cpuid" : "=a"(max_leaf), "=b"(b), "=c"(c), "=d"(d) : "a"(0), "c"(0)); // highest basic leaf
+    asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(1), "c"(0)); // feature flags
+    const bool cpu_avx = (c & AVX_mask) == AVX_mask;
+    // XGETBV is only legal once OSXSAVE is set; XCR0 bits 1 and 2 mean the OS saves XMM and YMM registers.
+    if (cpu_avx)
+        asm volatile("xgetbv" : "=a"(xcr0), "=d"(d) : "c"(0));
+    const bool have_avx = cpu_avx && (xcr0 & 0x6) == 0x6;
+    fputs(have_avx ? "Have AVX\n" : "Do not have AVX\n", stderr); // stderr: keep stdout clean for callers
+
+    // Leaf 7 only exists when max_leaf >= 7; EBX bit 5 there is AVX2, which needs the same YMM state as AVX.
+    if (have_avx && max_leaf >= 7) {
+        asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(7), "c"(0));
+        HAVE_AVX2 = (b & (1u << 5)) != 0;
+    }
+    fputs(HAVE_AVX2 ? "Have AVX2\n" : "Do not have AVX2\n", stderr);
+}
+#elif defined(__ARM_NEON)
+static inline void __attribute__((constructor)) display_neon() { fputs("Have NEON\n", stderr); } // compile-time feature, nothing to probe
+#endif
 
 static const uint32_t sha256_h[8] = {
     0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
@@ -44,7 +73,6 @@ void sha256_init(uint32_t *state)
 }
 
 #if defined(__i386__)
-
 static const uint32_t sha256_k[64] = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
@@ -564,22 +592,22 @@ static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
 
 #ifndef SCRYPT_MAX_WAYS
 #define SCRYPT_MAX_WAYS 1
-#define scrypt_best_throughput() 1
+#define SCRYPT_BEST_THROUGHPUT 1
 #endif
 
-unsigned char *scrypt_buffer_alloc(int N)
+unsigned char *scrypt_buffer_alloc(int N, bool multiWay) // returns malloc's result unchanged (may be NULL); multiWay=false sizes the buffer for a single way
 {
-    return (unsigned char*)malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63);
+    return (unsigned char*)malloc((size_t)N * (multiWay ? (HAVE_AVX2 ? 2 * SCRYPT_MAX_WAYS : SCRYPT_MAX_WAYS) : 1) * 128 + 63); // N * 128 bytes per way, ways doubled when HAVE_AVX2 is set; +63 is slack for the 64-byte round-up the users apply to the pointer
 }
 
-static void scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
-    uint32_t *midstate, unsigned char *scratchpad, int N)
+static void scrypt_N_1_1_256(const uint32_t *input,
+    uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
 {
     uint32_t tstate[8], ostate[8];
     uint32_t X[32] __attribute__((aligned(128)));
     uint32_t *V;
 
-    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));
 
     memcpy(tstate, midstate, 32);
     HMAC_SHA256_80_init(input, tstate, ostate);
@@ -601,7 +629,7 @@ static void scrypt_N_1_1_256_4way(const uint32_t *input,
     uint32_t *V;
     int i, k;
 
-    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));
 
     for (i = 0; i < 20; i++)
         for (k = 0; k < 4; k++)
@@ -637,7 +665,7 @@ static void scrypt_N_1_1_256_3way(const uint32_t *input,
     uint32_t X[3 * 32] __attribute__((aligned(64)));
     uint32_t *V;
 
-    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));
 
     memcpy(tstate +  0, midstate, 32);
     memcpy(tstate +  8, midstate, 32);
@@ -667,7 +695,7 @@ static void scrypt_N_1_1_256_12way(const uint32_t *input,
     uint32_t *V;
     int i, j, k;
 
-    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));
 
     for (j = 0; j < 3; j++)
         for (i = 0; i < 20; i++)
@@ -718,7 +746,7 @@ static void scrypt_N_1_1_256_24way(const uint32_t *input,
     uint32_t *V;
     int i, j, k;
 
-    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));
 
     for (j = 0; j < 3; j++)
         for (i = 0; i < 20; i++)
@@ -763,7 +791,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
         if (hash[i] > target[i]) {
             return false;
         }
-        if (hash[i] < target[i]) {
+        if (hash[i] < target[i]) { // strictly less: after the '>' test above, '<=' is always true and would end the loop on its first word
             return true;
         }
     }
@@ -774,14 +802,14 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
 bool scrypt_N_1_1_256_multi(void *input, uint256 hashTarget, int *nHashesDone, unsigned char *scratchbuf, int N)
 {
     uint32_t pdata[20];
-    uint32_t data[SCRYPT_MAX_WAYS * 20];
-    uint32_t dhash[SCRYPT_MAX_WAYS * 8];
+    uint32_t data[(2 * SCRYPT_MAX_WAYS) * 20];
+    uint32_t dhash[(2 * SCRYPT_MAX_WAYS) * 8];
     uint32_t midstate[8];
     uint32_t n;
-    int throughput = scrypt_best_throughput();
+    int throughput = (HAVE_AVX2 ? 2 * SCRYPT_BEST_THROUGHPUT : SCRYPT_BEST_THROUGHPUT);
     int i;
 
-    for (int i = 0; i < 20; i++)
+    for (i = 0; i < 20; i++)
         pdata[i] = be32dec(&((const uint32_t *)input)[i]);
     n = pdata[19];
 
@@ -836,7 +864,7 @@ bool scryptHash(const void *input, char *output, int N)
 {
     uint32_t midstate[8];
     uint32_t data[20];
-    unsigned char *scratchbuf = scrypt_buffer_alloc(N);
+    unsigned char *scratchbuf = scrypt_buffer_alloc(N, false);
 
     memset(output, 0, 32);
     if (!scratchbuf)
diff --git a/src/crypto/scrypt2.h b/src/crypto/scrypt_opt.h
similarity index 51%
rename from src/crypto/scrypt2.h
rename to src/crypto/scrypt_opt.h
index ca1814f..5a370bf 100644
--- a/src/crypto/scrypt2.h
+++ b/src/crypto/scrypt_opt.h
@@ -1,5 +1,9 @@
-#ifndef SCRYPT2_H
-#define SCRYPT2_H
+// Copyright (c) 2018-2019 The Simplicity developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef SCRYPT_OPT_H
+#define SCRYPT_OPT_H
 
 #if defined HAVE_CONFIG_H
 #include "config/simplicity-config.h"
@@ -9,27 +13,27 @@
 #include <stdint.h>
 #include <stddef.h>
 #include <inttypes.h>
+#include <compat/byteswap.h>
 #include "uint256.h"
 #include "utilstrencodings.h"
 
 
-static const int SCRYPT_SCRATCHPAD_SIZE = 134218239;
+//static const int SCRYPT_SCRATCHPAD_SIZE = 134218239;
 //static const int N = 1048576;
 
-int scrypt_best_throughput();
-
-bool scrypt_N_1_1_256_multi(void *input, uint256 hashTarget, int *nHashesDone, unsigned char *scratchbuf);
+bool scrypt_N_1_1_256_multi(void *input, uint256 hashTarget, int *nHashesDone, unsigned char *scratchbuf, int N);
 
 bool scryptHash(const void *input, char *output, int N);
-extern unsigned char *scrypt_buffer_alloc(int N);
+extern unsigned char *scrypt_buffer_alloc(int N, bool multiWay = true);
 extern "C" void scrypt_core(uint32_t *X, uint32_t *V, int N);
+void sha256_init(uint32_t *state);
 extern "C" void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
 
 #if defined(__x86_64__) && !defined(ENABLE_AVX2)
 #define SCRYPT_MAX_WAYS 12
 #define HAVE_SCRYPT_3WAY 1
 #define HAVE_SHA256_4WAY 1
-#define scrypt_best_throughput() 3;
+#define SCRYPT_BEST_THROUGHPUT 3
 extern "C" int sha256_use_4way();
 extern "C" void sha256_init_4way(uint32_t *state);
 extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
@@ -37,35 +41,63 @@ extern "C" void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
 #endif
 
 #if defined(__x86_64__) && defined(ENABLE_AVX2)
-#define SCRYPT_MAX_WAYS 24
-#define HAVE_SCRYPT_6WAY 1
+#define SCRYPT_MAX_WAYS 12
+#define HAVE_SCRYPT_3WAY 1
 #define HAVE_SHA256_4WAY 1
+#define SCRYPT_BEST_THROUGHPUT 3
+extern "C" int sha256_use_4way();
+extern "C" void sha256_init_4way(uint32_t *state);
+extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
+extern "C" void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
+
+//#define SCRYPT_MAX_WAYS 24
+#define HAVE_SCRYPT_6WAY 1
+//#define HAVE_SHA256_4WAY 1
 #define HAVE_SHA256_8WAY 1
-#define scrypt_best_throughput() 6;
+//#define SCRYPT_BEST_THROUGHPUT 6
 extern "C" int sha256_use_8way();
 extern "C" void sha256_init_8way(uint32_t *state);
 extern "C" void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
-extern "C" int sha256_use_4way();
-extern "C" void sha256_init_4way(uint32_t *state);
-extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
+//extern "C" int sha256_use_4way();
+//extern "C" void sha256_init_4way(uint32_t *state);
+//extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
 extern "C" void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
 #endif
 
 #if defined(__i386__)
 #define SCRYPT_MAX_WAYS 4
 #define HAVE_SHA256_4WAY 1
-#define scrypt_best_throughput() 1
-extern "C" void scrypt_core(uint32_t *X, uint32_t *V, int N);
+#define SCRYPT_BEST_THROUGHPUT 1
+extern "C" int sha256_use_4way();
+extern "C" void sha256_init_4way(uint32_t *state);
+extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
+#endif
+
+#if defined(__arm__) && defined(__APCS_32__)
+#if !defined(__ARM_NEON)
+#define SCRYPT_MAX_WAYS 1
+#define SCRYPT_BEST_THROUGHPUT 1
+#else
+#define SCRYPT_MAX_WAYS 12
+#define HAVE_SCRYPT_3WAY 1
+#define HAVE_SHA256_4WAY 1
+#define SCRYPT_BEST_THROUGHPUT 3
 extern "C" int sha256_use_4way();
 extern "C" void sha256_init_4way(uint32_t *state);
 extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
+extern "C" void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
+#endif
 #endif
 
-#define bswap_32_scrypt(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
-                   | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#if defined(__aarch64__)
+#define SCRYPT_MAX_WAYS 3
+#define HAVE_SCRYPT_3WAY 1
+#define SCRYPT_BEST_THROUGHPUT 3
+extern "C" void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
+#endif
 
 static inline uint32_t swab32(uint32_t v)
 {
-    return bswap_32_scrypt(v);
+    return bswap_32(v); // presumably the bswap_32 made available by the <compat/byteswap.h> include in this header; confirm
 }
 #endif
diff --git a/src/crypto/sha2-arm.S b/src/crypto/sha2-arm.S
index 58e883d..e86f3de 100644
--- a/src/crypto/sha2-arm.S
+++ b/src/crypto/sha2-arm.S
@@ -11,7 +11,7 @@
 #include "config/simplicity-config.h"
 #endif
 
-#if defined(__arm__) && defined(__APCS_32__)
+#if /*defined(USE_ASM) &&*/ defined(__arm__) && defined(__APCS_32__)
 
 
 
diff --git a/src/crypto/sha2-armv8.c b/src/crypto/sha2-armv8.c
new file mode 100644
index 0000000..99732f7
--- /dev/null
+++ b/src/crypto/sha2-armv8.c
@@ -0,0 +1,142 @@
+/*
+ *  ARMv8-A Cryptography Extension SHA256 support functions
+ *
+ *  Copyright (C) 2016, CriticalBlue Limited, All Rights Reserved
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__aarch64__)
+
+#include <arm_neon.h>
+
+static const uint32_t sha256_k[64] = {
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define Rx(T0, T1, K, W0, W1, W2, W3)      \
+    W0 = vsha256su0q_u32( W0, W1 );    \
+    d2 = d0;                           \
+    T1 = vaddq_u32( W1, K );           \
+    d0 = vsha256hq_u32( d0, d1, T0 );  \
+    d1 = vsha256h2q_u32( d1, d2, T0 ); \
+    W0 = vsha256su1q_u32( W0, W2, W3 );
+
+#define Ry(T0, T1, K, W1)                  \
+    d2 = d0;                           \
+    T1 = vaddq_u32( W1, K  );          \
+    d0 = vsha256hq_u32( d0, d1, T0 );  \
+    d1 = vsha256h2q_u32( d1, d2, T0 );
+
+#define Rz(T0)                             \
+    d2 = d0;                           \
+    d0 = vsha256hq_u32( d0, d1, T0 );  \
+    d1 = vsha256h2q_u32( d1, d2, T0 );
+
+void sha256_transform(uint32_t *state, const uint32_t *block, int swap) /* folds one 16-word (64-byte) block into state[0..7] with the vsha256* intrinsics */
+{
+    /* working vectors: k* = round constants, s* = incoming state, w* = message words, d* = running digest, t* = word+constant sums */
+    uint32x4_t k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, ka, kb, kc, kd, ke, kf;
+    uint32x4_t s0, s1;
+    uint32x4_t w0, w1, w2, w3;
+    uint32x4_t d0, d1, d2;
+    uint32x4_t t0, t1;
+
+    /* load the 64 entries of sha256_k, four per vector, into k0..kf */
+    k0 = vld1q_u32(&sha256_k[0x00]);
+    k1 = vld1q_u32(&sha256_k[0x04]);
+    k2 = vld1q_u32(&sha256_k[0x08]);
+    k3 = vld1q_u32(&sha256_k[0x0c]);
+    k4 = vld1q_u32(&sha256_k[0x10]);
+    k5 = vld1q_u32(&sha256_k[0x14]);
+    k6 = vld1q_u32(&sha256_k[0x18]);
+    k7 = vld1q_u32(&sha256_k[0x1c]);
+    k8 = vld1q_u32(&sha256_k[0x20]);
+    k9 = vld1q_u32(&sha256_k[0x24]);
+    ka = vld1q_u32(&sha256_k[0x28]);
+    kb = vld1q_u32(&sha256_k[0x2c]);
+    kc = vld1q_u32(&sha256_k[0x30]);
+    kd = vld1q_u32(&sha256_k[0x34]);
+    ke = vld1q_u32(&sha256_k[0x38]);
+    kf = vld1q_u32(&sha256_k[0x3c]);
+
+    /* load state: s0 = state[0..3], s1 = state[4..7] */
+    s0 = vld1q_u32(&state[0]);
+    s1 = vld1q_u32(&state[4]);
+
+    /* load message: the 16 words of block, four per vector */
+    w0 = vld1q_u32(block);
+    w1 = vld1q_u32(block + 4);
+    w2 = vld1q_u32(block + 8);
+    w3 = vld1q_u32(block + 12);
+
+    if (swap) { /* non-zero swap: reverse the byte order inside every 32-bit message word */
+        w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
+        w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
+        w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
+        w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));
+    }
+
+    /* initialize t0 with the first word+constant sum; d0/d1 start as copies of the incoming state */
+    t0 = vaddq_u32(w0, k0);
+    d0 = s0;
+    d1 = s1;
+
+    /* perform rounds of four: each macro consumes one t* sum while preparing the other, so t0 and t1 alternate */
+    Rx(t0, t1, k1, w0, w1, w2, w3); /* Rx also advances the message schedule of its first w argument */
+    Rx(t1, t0, k2, w1, w2, w3, w0);
+    Rx(t0, t1, k3, w2, w3, w0, w1);
+    Rx(t1, t0, k4, w3, w0, w1, w2);
+    Rx(t0, t1, k5, w0, w1, w2, w3);
+    Rx(t1, t0, k6, w1, w2, w3, w0);
+    Rx(t0, t1, k7, w2, w3, w0, w1);
+    Rx(t1, t0, k8, w3, w0, w1, w2);
+    Rx(t0, t1, k9, w0, w1, w2, w3);
+    Rx(t1, t0, ka, w1, w2, w3, w0);
+    Rx(t0, t1, kb, w2, w3, w0, w1);
+    Rx(t1, t0, kc, w3, w0, w1, w2);
+    Ry(t0, t1, kd, w1); /* Ry: no schedule update, only the next word+constant sum is prepared */
+    Ry(t1, t0, ke, w2);
+    Ry(t0, t1, kf, w3);
+    Rz(t1); /* Rz: last group, consumes t1 and prepares nothing further */
+
+    /* update state: add the digest produced by the rounds to the incoming state */
+    s0 = vaddq_u32(s0, d0);
+    s1 = vaddq_u32(s1, d1);
+
+    /* save state: write the eight updated words back to state[0..7] */
+    vst1q_u32(&state[0], s0);
+    vst1q_u32(&state[4], s1);
+}
+#endif
diff --git a/src/crypto/sha2-x64.S b/src/crypto/sha2-x64.S
index 74b2325..90bde71 100644
--- a/src/crypto/sha2-x64.S
+++ b/src/crypto/sha2-x64.S
@@ -15,7 +15,7 @@
     .section .note.GNU-stack,"",%progbits
 #endif
 
-#if defined(__x86_64__)
+#if /*defined(USE_ASM) &&*/ defined(__x86_64__)
 
     .data
     .p2align 4
@@ -1626,7 +1626,7 @@ sha256d_8preext2_24:
 sha256d_8preext2_30:
     .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022
 
-#endif /* USE_AVX2 */
+#endif /* ENABLE_AVX2 */
 
 
     .text
@@ -1692,7 +1692,7 @@ _sha256_init_8way:
     popq    %rdi
 #endif
     ret
-#endif /* USE_AVX2 */
+#endif /* ENABLE_AVX2 */
 
 
 
@@ -1700,13 +1700,13 @@ _sha256_init_8way:
 
 
 
-#if defined(ENABLE_AVX2)
+#if defined(ENABLE_AVX)
 
 
 
 
 
-#endif /* USE_AVX */
+#endif /* ENABLE_AVX */
 
 
 #if defined(ENABLE_AVX2)
@@ -1715,16 +1715,16 @@ _sha256_init_8way:
 
 
 
-#endif /* USE_AVX2 */
+#endif /* ENABLE_AVX2 */
 
 
-#if defined(USE_XOP)
+#if defined(ENABLE_XOP)
 
 
 
 
 
-#endif /* USE_XOP */
+#endif /* ENABLE_XOP */
 
 
     .text
@@ -1879,7 +1879,7 @@ sha256_transform_4way_sse2_main_loop:
     jmp sha256_transform_4way_finish
 
 
-#if defined(ENABLE_AVX2)
+#if defined(ENABLE_AVX)
     .text
     .p2align 6
 sha256_transform_4way_core_avx:
@@ -5745,10 +5745,10 @@ sha256_transform_4way_core_avx:
     vpxor   %xmm2, %xmm7, %xmm7
     vpaddd  %xmm6, %xmm7, %xmm7
     jmp sha256_transform_4way_finish
-#endif /* USE_AVX */
+#endif /* ENABLE_AVX */
 
 
-#if defined(USE_XOP)
+#if defined(ENABLE_XOP)
     .text
     .p2align 6
 sha256_transform_4way_core_xop:
@@ -8462,7 +8462,7 @@ sha256_transform_4way_core_xop:
     vpxor   %xmm2, %xmm7, %xmm7
     vpaddd  %xmm6, %xmm7, %xmm7
     jmp sha256_transform_4way_finish
-#endif /* USE_XOP */
+#endif /* ENABLE_XOP */
 
 
     .data
@@ -12795,7 +12795,7 @@ sha256_transform_8way_finish:
 #endif
     ret
 
-#endif /* USE_AVX2 */
+#endif /* ENABLE_AVX2 */
 
 
     .data
@@ -18451,7 +18451,7 @@ sha256d_ms_4way_sse2_finish:
     ret
 
 
-#if defined(ENABLE_AVX2)
+#if defined(ENABLE_AVX)
 
     .p2align 6
 sha256d_ms_4way_avx:
@@ -22566,10 +22566,10 @@ sha256d_ms_4way_avx_finish:
 #endif
     ret
 
-#endif /* USE_AVX */
+#endif /* ENABLE_AVX */
 
 
-#if defined(USE_XOP)
+#if defined(ENABLE_XOP)
 
     .p2align 6
 sha256d_ms_4way_xop:
@@ -25489,7 +25489,7 @@ sha256d_ms_4way_xop_finish:
 #endif
     ret
 
-#endif /* USE_XOP */
+#endif /* ENABLE_XOP */
 
 
     .text
@@ -25502,7 +25502,7 @@ _sha256_use_4way:
     pushq   %rcx
     pushq   %rdx
 
-#if defined(ENABLE_AVX2)
+#if defined(ENABLE_AVX)
     /* Check for AVX and OSXSAVE support */
     movl    $1, %eax
     cpuid
@@ -25515,7 +25515,7 @@ _sha256_use_4way:
     andl    $0x00000006, %eax
     cmpl    $0x00000006, %eax
     jne sha256_use_4way_base
-#if defined(USE_XOP)
+#if defined(ENABLE_XOP)
     /* Check for XOP support */
     movl    $0x80000001, %eax
     cpuid
@@ -25526,13 +25526,13 @@ sha256_use_4way_xop:
     leaq    sha256d_ms_4way_xop(%rip), %rcx
     leaq    sha256_transform_4way_core_xop(%rip), %rdx
     jmp sha256_use_4way_done
-#endif /* USE_XOP */
+#endif /* ENABLE_XOP */
 
 sha256_use_4way_avx:
     leaq    sha256d_ms_4way_avx(%rip), %rcx
     leaq    sha256_transform_4way_core_avx(%rip), %rdx
     jmp sha256_use_4way_done
-#endif /* USE_AVX */
+#endif /* ENABLE_AVX */
 
 sha256_use_4way_base:
     leaq    sha256d_ms_4way_sse2(%rip), %rcx
@@ -29713,6 +29713,6 @@ sha256_use_8way_done:
     popq    %rbx
     ret
 
-#endif /* USE_AVX2 */
+#endif /* ENABLE_AVX2 */
 
 #endif
diff --git a/src/crypto/sha2-x86.S b/src/crypto/sha2-x86.S
index def2d51..16664c0 100644
--- a/src/crypto/sha2-x86.S
+++ b/src/crypto/sha2-x86.S
@@ -15,7 +15,7 @@
     .section .note.GNU-stack,"",%progbits
 #endif
 
-#if defined(__i386__)
+#if /*defined(USE_ASM) &&*/ defined(__i386__)
 
     .data
     .p2align 7
diff --git a/src/crypto/sha256.cpp b/src/crypto/sha256.cpp
index 8410e59..53ab277 100644
--- a/src/crypto/sha256.cpp
+++ b/src/crypto/sha256.cpp
@@ -3,13 +3,14 @@
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 
 #include "crypto/sha256.h"
+#include "crypto/scrypt_opt.h"
 
 #include "crypto/common.h"
 
 #include <string.h>
 
 // Internal implementation code.
-namespace
+/*namespace
 {
 /// Internal SHA-256 implementation.
 namespace sha256
@@ -19,19 +20,19 @@ uint32_t inline Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (z &
 uint32_t inline Sigma0(uint32_t x) { return (x >> 2 | x << 30) ^ (x >> 13 | x << 19) ^ (x >> 22 | x << 10); }
 uint32_t inline Sigma1(uint32_t x) { return (x >> 6 | x << 26) ^ (x >> 11 | x << 21) ^ (x >> 25 | x << 7); }
 uint32_t inline sigma0(uint32_t x) { return (x >> 7 | x << 25) ^ (x >> 18 | x << 14) ^ (x >> 3); }
-uint32_t inline sigma1(uint32_t x) { return (x >> 17 | x << 15) ^ (x >> 19 | x << 13) ^ (x >> 10); }
+uint32_t inline sigma1(uint32_t x) { return (x >> 17 | x << 15) ^ (x >> 19 | x << 13) ^ (x >> 10); }*/
 
 /** One round of SHA-256. */
-void inline Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d, uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t k, uint32_t w)
+/*void inline Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d, uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t k, uint32_t w)
 {
     uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
     uint32_t t2 = Sigma0(a) + Maj(a, b, c);
     d += t1;
     h = t1 + t2;
-}
+}*/
 
 /** Initialize SHA-256 state. */
-void inline Initialize(uint32_t* s)
+/*void inline Initialize(uint32_t* s)
 {
     s[0] = 0x6a09e667ul;
     s[1] = 0xbb67ae85ul;
@@ -41,10 +42,10 @@ void inline Initialize(uint32_t* s)
     s[5] = 0x9b05688cul;
     s[6] = 0x1f83d9abul;
     s[7] = 0x5be0cd19ul;
-}
+}*/
 
 /** Perform one SHA-256 transformation, processing a 64-byte chunk. */
-void Transform(uint32_t* s, const unsigned char* chunk)
+/*void Transform(uint32_t* s, const unsigned char* chunk)
 {
     uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
     uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
@@ -128,14 +129,15 @@ void Transform(uint32_t* s, const unsigned char* chunk)
 }
 
 } // namespace sha256
-} // namespace
+}*/ // namespace
 
 
 ////// SHA-256
 
 CSHA256::CSHA256() : bytes(0)
 {
-    sha256::Initialize(s);
+    //sha256::Initialize(s);
+    sha256_init(s);
 }
 
 CSHA256& CSHA256::Write(const unsigned char* data, size_t len)
@@ -147,12 +149,14 @@ CSHA256& CSHA256::Write(const unsigned char* data, size_t len)
         memcpy(buf + bufsize, data, 64 - bufsize);
         bytes += 64 - bufsize;
         data += 64 - bufsize;
-        sha256::Transform(s, buf);
+        //sha256::Transform(s, buf);
+        sha256_transform(s, (uint32_t*)buf, 1);
         bufsize = 0;
     }
     while (end >= data + 64) {
         // Process full chunks directly from the source.
-        sha256::Transform(s, data);
+        //sha256::Transform(s, data);
+        sha256_transform(s, (uint32_t*)data, 1);
         bytes += 64;
         data += 64;
     }
@@ -184,6 +188,7 @@ void CSHA256::Finalize(unsigned char hash[OUTPUT_SIZE])
 CSHA256& CSHA256::Reset()
 {
     bytes = 0;
-    sha256::Initialize(s);
+    //sha256::Initialize(s);
+    sha256_init(s);
     return *this;
 }
diff --git a/src/hash.cpp b/src/hash.cpp
index f470c11..58b307e 100644
--- a/src/hash.cpp
+++ b/src/hash.cpp
@@ -78,46 +78,4 @@ void BIP32Hash(const ChainCode chainCode, unsigned int nChild, unsigned char hea
     num[2] = (nChild >> 8) & 0xFF;
     num[3] = (nChild >> 0) & 0xFF;
     CHMAC_SHA512(chainCode.begin(), chainCode.size()).Write(&header, 1).Write(data, 32).Write(num, 4).Finalize(output);
-}
-
-uint256 scrypt_hash(const void* input, size_t inputlen)
-{
-    uint256 result = 0;
-
-    scrypt((const char*)input, inputlen, (const char*)input, inputlen, (char*)&result, 1024, 1, 1, 32);
-
-    return result;
-}
-
-uint256 scrypt_salted_hash(const void* input, size_t inputlen, const void* salt, size_t saltlen)
-{
-    uint256 result = 0;
-
-    scrypt((const char*)input, inputlen, (const char*)salt, saltlen, (char*)&result, 1024, 1, 1, 32);
-
-    return result;
-}
-
-uint256 scrypt_salted_multiround_hash(const void* input, size_t inputlen, const void* salt, size_t saltlen, const unsigned int nRounds)
-{
-    uint256 resultHash = scrypt_salted_hash(input, inputlen, salt, saltlen);
-    uint256 transitionalHash = resultHash;
-
-    for (unsigned int i = 1; i < nRounds; i++)
-    {
-        resultHash = scrypt_salted_hash(input, inputlen, (const void*)&transitionalHash, 32);
-        transitionalHash = resultHash;
-    }
-
-    return resultHash;
-}
-
-uint256 scrypt_blockhash(const void* input)
-{
-    return scrypt_hash(input, 80);
-}
-
-void scrypt_hash(const char* pass, unsigned int pLen, const char* salt, unsigned int sLen, char* output, unsigned int N, unsigned int r, unsigned int p, unsigned int dkLen)
-{
-    scrypt(pass, pLen, salt, sLen, output, N, r, p, dkLen);
 }
\ No newline at end of file
diff --git a/src/hash.h b/src/hash.h
index 327fd7a..4cbfca4 100644
--- a/src/hash.h
+++ b/src/hash.h
@@ -22,9 +22,10 @@
 #include "crypto/sph_jh.h"
 #include "crypto/sph_keccak.h"
 #include "crypto/sph_skein.h"
+#include "crypto/sha1.h"
 #include "crypto/sha512.h"
 #include "crypto/scrypt.h"
-#include "crypto/scrypt2.h"
+#include "crypto/scrypt_opt.h"
 
 #include <iomanip>
 #include <openssl/sha.h>
@@ -33,6 +34,8 @@
 
 
 typedef uint256 ChainCode;
+static const unsigned char PBLANK[1] = {};
+static const uint256 ZERO = uint256(0);
 
 /** A hasher class for Bitcoin's 256-bit hash (double SHA-256). */
 class CHash256
@@ -63,6 +66,34 @@ class CHash256
     }
 };
 
+class CHash1 // SHA-1 applied twice: Finalize() hashes the SHA-1 digest of the written data once more
+{
+private:
+    CSHA1 sha; // running SHA-1 state fed by Write()
+
+public:
+    static const size_t OUTPUT_SIZE = CSHA1::OUTPUT_SIZE; // digest length in bytes, identical to a single SHA-1 digest
+
+    void Finalize(unsigned char hash[OUTPUT_SIZE]) // writes the double-SHA-1 digest into hash
+    {
+        unsigned char buf[CSHA1::OUTPUT_SIZE];
+        sha.Finalize(buf); // first pass: digest of everything written so far
+        sha.Reset().Write(buf, CSHA1::OUTPUT_SIZE).Finalize(hash); // second pass: digest of that digest, reusing the same CSHA1 object
+    }
+
+    CHash1& Write(const unsigned char* data, size_t len) // feeds len bytes into the first pass; returns *this so calls can be chained
+    {
+        sha.Write(data, len);
+        return *this;
+    }
+
+    CHash1& Reset() // resets the underlying CSHA1 so the object can be reused; returns *this
+    {
+        sha.Reset();
+        return *this;
+    }
+};
+
 class CHash512
 {
 private:
@@ -179,17 +210,35 @@ inline void Hash(void* in, unsigned int len, unsigned char* out)
 template <typename T1>
 inline uint512 Hash512(const T1 pbegin, const T1 pend)
 {
-    static const unsigned char pblank[1] = {};
     uint512 result;
-    CHash512().Write(pbegin == pend ? pblank : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result);
+    CHash512().Write(pbegin == pend ? PBLANK : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result);
     return result;
 }
+
+/** Compute the 512-bit hash of the concatenation of two objects. */
 template <typename T1, typename T2>
 inline uint512 Hash512(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end)
 {
-    static const unsigned char pblank[1] = {};
     uint512 result;
-    CHash512().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Finalize((unsigned char*)&result);
+    CHash512().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Finalize((unsigned char*)&result);
+    return result;
+}
+
+/** Compute the CHash1 (double SHA-1) hash of the range [pbegin, pend) and return it in a uint256. */
+template <typename T1>
+inline uint256 Hash1(const T1 pbegin, const T1 pend)
+{
+    uint256 result; // CHash1::Finalize writes only CHash1::OUTPUT_SIZE bytes (presumably 20; confirm against CSHA1); the rest stay as default-constructed
+    CHash1().Write(pbegin == pend ? PBLANK : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result); // PBLANK stands in for &pbegin[0] when the range is empty
+    return result;
+}
+
+/** Compute the CHash1 (double SHA-1) hash of [p1begin, p1end) followed by [p2begin, p2end) and return it in a uint256. */
+template <typename T1, typename T2>
+inline uint256 Hash1(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end)
+{
+    uint256 result; // CHash1::Finalize writes only CHash1::OUTPUT_SIZE bytes (presumably 20; confirm against CSHA1); the rest stay as default-constructed
+    CHash1().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Finalize((unsigned char*)&result); // each empty range is replaced by PBLANK with length 0
+    return result;
+}
 
@@ -197,9 +246,8 @@ inline uint512 Hash512(const T1 p1begin, const T1 p1end, const T2 p2begin, const
 template <typename T1>
 inline uint256 Hash(const T1 pbegin, const T1 pend)
 {
-    static const unsigned char pblank[1] = {};
     uint256 result;
-    CHash256().Write(pbegin == pend ? pblank : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result);
+    CHash256().Write(pbegin == pend ? PBLANK : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
@@ -207,9 +255,8 @@ inline uint256 Hash(const T1 pbegin, const T1 pend)
 template <typename T1, typename T2>
 inline uint256 Hash(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end)
 {
-    static const unsigned char pblank[1] = {};
     uint256 result;
-    CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Finalize((unsigned char*)&result);
+    CHash256().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
@@ -217,49 +264,44 @@ inline uint256 Hash(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2
 template <typename T1, typename T2, typename T3>
 inline uint256 Hash(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end, const T3 p3begin, const T3 p3end)
 {
-    static const unsigned char pblank[1] = {};
     uint256 result;
-    CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? pblank : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Finalize((unsigned char*)&result);
+    CHash256().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? PBLANK : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
-/** Compute the 256-bit hash of the concatenation of three objects. */
+/** Compute the 256-bit hash of the concatenation of four objects. */
 template <typename T1, typename T2, typename T3, typename T4>
 inline uint256 Hash(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end, const T3 p3begin, const T3 p3end, const T4 p4begin, const T4 p4end)
 {
-    static const unsigned char pblank[1] = {};
     uint256 result;
-    CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? pblank : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Write(p4begin == p4end ? pblank : (const unsigned char*)&p4begin[0], (p4end - p4begin) * sizeof(p4begin[0])).Finalize((unsigned char*)&result);
+    CHash256().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? PBLANK : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Write(p4begin == p4end ? PBLANK : (const unsigned char*)&p4begin[0], (p4end - p4begin) * sizeof(p4begin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
-/** Compute the 256-bit hash of the concatenation of three objects. */
+/** Compute the 256-bit hash of the concatenation of five objects. */
 template <typename T1, typename T2, typename T3, typename T4, typename T5>
 inline uint256 Hash(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end, const T3 p3begin, const T3 p3end, const T4 p4begin, const T4 p4end, const T5 p5begin, const T5 p5end)
 {
-    static const unsigned char pblank[1] = {};
     uint256 result;
-    CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? pblank : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Write(p4begin == p4end ? pblank : (const unsigned char*)&p4begin[0], (p4end - p4begin) * sizeof(p4begin[0])).Write(p5begin == p5end ? pblank : (const unsigned char*)&p5begin[0], (p5end - p5begin) * sizeof(p5begin[0])).Finalize((unsigned char*)&result);
+    CHash256().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? PBLANK : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Write(p4begin == p4end ? PBLANK : (const unsigned char*)&p4begin[0], (p4end - p4begin) * sizeof(p4begin[0])).Write(p5begin == p5end ? PBLANK : (const unsigned char*)&p5begin[0], (p5end - p5begin) * sizeof(p5begin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
-/** Compute the 256-bit hash of the concatenation of three objects. */
+/** Compute the 256-bit hash of the concatenation of six objects. */
 template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
 inline uint256 Hash(const T1 p1begin, const T1 p1end, const T2 p2begin, const T2 p2end, const T3 p3begin, const T3 p3end, const T4 p4begin, const T4 p4end, const T5 p5begin, const T5 p5end, const T6 p6begin, const T6 p6end)
 {
-    static const unsigned char pblank[1] = {};
     uint256 result;
-    CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? pblank : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Write(p4begin == p4end ? pblank : (const unsigned char*)&p4begin[0], (p4end - p4begin) * sizeof(p4begin[0])).Write(p5begin == p5end ? pblank : (const unsigned char*)&p5begin[0], (p5end - p5begin) * sizeof(p5begin[0])).Write(p6begin == p6end ? pblank : (const unsigned char*)&p6begin[0], (p6end - p6begin) * sizeof(p6begin[0])).Finalize((unsigned char*)&result);
+    CHash256().Write(p1begin == p1end ? PBLANK : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0])).Write(p2begin == p2end ? PBLANK : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0])).Write(p3begin == p3end ? PBLANK : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0])).Write(p4begin == p4end ? PBLANK : (const unsigned char*)&p4begin[0], (p4end - p4begin) * sizeof(p4begin[0])).Write(p5begin == p5end ? PBLANK : (const unsigned char*)&p5begin[0], (p5end - p5begin) * sizeof(p5begin[0])).Write(p6begin == p6end ? PBLANK : (const unsigned char*)&p6begin[0], (p6end - p6begin) * sizeof(p6begin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
-/** Compute the 160-bit hash an object. */
+/** Compute the 160-bit hash of an object. */
 template <typename T1>
 inline uint160 Hash160(const T1 pbegin, const T1 pend)
 {
-    static unsigned char pblank[1] = {};
     uint160 result;
-    CHash160().Write(pbegin == pend ? pblank : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result);
+    CHash160().Write(pbegin == pend ? PBLANK : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0])).Finalize((unsigned char*)&result);
     return result;
 }
 
@@ -317,10 +359,47 @@ unsigned int MurmurHash3(unsigned int nHashSeed, const std::vector<unsigned char
 
 void BIP32Hash(const ChainCode chainCode, unsigned int nChild, unsigned char header, const unsigned char data[32], unsigned char output[64]);
 
-uint256 scrypt_salted_multiround_hash(const void* input, size_t inputlen, const void* salt, size_t saltlen, const unsigned int nRounds);
-uint256 scrypt_salted_hash(const void* input, size_t inputlen, const void* salt, size_t saltlen);
-uint256 scrypt_hash(const void* input, size_t inputlen/*, const unsigned int N=1024*/);
-uint256 scrypt_blockhash(const void* input);
+inline uint256 scrypt_hash(const void* input, size_t inputlen, const unsigned int N=1024) // unsalted variant: the input buffer is handed to scrypt() twice, in both the password and the salt positions
+{
+    uint256 result;
+    scrypt((const char*)input, inputlen, (const char*)input, inputlen, (char*)&result, N, 1, 1, 32); // r = 1, p = 1, 32 output bytes written into result
+    return result;
+}
+
+inline uint256 scrypt_salted_hash(const void* input, size_t inputlen, const void* salt, size_t saltlen) // scrypt of input with a caller-supplied salt; unlike scrypt_hash, N is not configurable here
+{
+    uint256 result;
+    scrypt((const char*)input, inputlen, (const char*)salt, saltlen, (char*)&result, 1024, 1, 1, 32); // N = 1024, r = 1, p = 1, 32 output bytes written into result
+    return result;
+}
+
+inline uint256 scrypt_salted_multiround_hash(const void* input, size_t inputlen, const void* salt, size_t saltlen, const unsigned int nRounds) // chained scrypt_salted_hash: round 1 uses the caller's salt, every later round uses the previous round's result as salt
+{
+    uint256 resultHash = scrypt_salted_hash(input, inputlen, salt, saltlen); // round 1 always runs, so nRounds of 0 and 1 both produce a single round
+    uint256 transitionalHash = resultHash;
+
+    for (unsigned int i = 1; i < nRounds; i++) {
+        resultHash = scrypt_salted_hash(input, inputlen, (const void*)&transitionalHash, 32); // same input each round; the salt is the 32 bytes at &transitionalHash
+        transitionalHash = resultHash;
+    }
+
+    return resultHash;
+}
+
+inline uint256 scrypt_blockhash(const void* input)
+{
+    // Previous form, kept for reference: scrypt_hash(input, 80). NOTE(review): presumably scryptHash() gives the same digest for an 80-byte input - confirm.
+    uint256 result;
+    scryptHash(input, (char*)&result, 1024); // no length is passed: input is expected to be exactly 80 bytes. NOTE(review): the return value, which HashScrypt tests as a success flag, is ignored here
+    return result;
+}
+
+inline uint256 scrypt_squared_blockhash(const void* input) // same call as scrypt_blockhash, but with 1048576 (1024 * 1024) as the last argument instead of 1024
+{
+    uint256 result;
+    scryptHash(input, (char*)&result, 1048576); // no length is passed: input is expected to be exactly 80 bytes. NOTE(review): the return value, which HashScryptSquared tests as a success flag, is ignored here
+    return result;
+}
 
 //int HMAC_SHA512_Init(HMAC_SHA512_CTX *pctx, const void *pkey, size_t len);
 //int HMAC_SHA512_Update(HMAC_SHA512_CTX *pctx, const void *pdata, size_t len);
@@ -336,7 +415,6 @@ inline uint256 HashQuark(const T1 pbegin, const T1 pend)
     sph_jh512_context ctx_jh;
     sph_keccak512_context ctx_keccak;
     sph_skein512_context ctx_skein;
-    static unsigned char pblank[1];
 
     uint512 mask = 8;
     uint512 zero = 0;
@@ -345,7 +423,7 @@ inline uint256 HashQuark(const T1 pbegin, const T1 pend)
 
     sph_blake512_init(&ctx_blake);
     // ZBLAKE;
-    sph_blake512(&ctx_blake, (pbegin == pend ? pblank : static_cast<const void*>(&pbegin[0])), (pend - pbegin) * sizeof(pbegin[0]));
+    sph_blake512(&ctx_blake, (pbegin == pend ? PBLANK : static_cast<const void*>(&pbegin[0])), (pend - pbegin) * sizeof(pbegin[0]));
     sph_blake512_close(&ctx_blake, static_cast<void*>(&hash[0]));
 
     sph_bmw512_init(&ctx_bmw);
@@ -415,22 +493,29 @@ inline uint256 HashQuark(const T1 pbegin, const T1 pend)
 template <typename T1>
 inline uint256 HashScrypt(const T1 pbegin, const T1 pend)
 {
-    static unsigned char pblank[1];
-    return scrypt_hash((pbegin == pend ? pblank : static_cast<const void*>(&pbegin[0])), (pend - pbegin) * sizeof(pbegin[0]));
+    uint256 result;
+    if ((pend - pbegin) * sizeof(pbegin[0]) != 80 || !scryptHash(static_cast<const void*>(&pbegin[0]), (char*)&result, 1024) || result == ZERO) {
+        LogPrintf("Falling back to original implementation to generate normal scrypt hash\n");
+        return scrypt_hash((pbegin == pend ? PBLANK : static_cast<const void*>(&pbegin[0])), (pend - pbegin) * sizeof(pbegin[0]));
+    }
+    return result;
 }
 
-/* ----------- Scrypt^2 Hash ------------------------------------------------ */
+/* ----------- Scrypt² Hash ------------------------------------------------ */
 template <typename T1>
 inline uint256 HashScryptSquared(const T1 pbegin, const T1 pend)
 {
-    static unsigned char pblank[1];
-    //return scrypt_hash((pbegin == pend ? pblank : static_cast<const void*>(&pbegin[0])), (pend - pbegin) * sizeof(pbegin[0]), 1048576);
-    uint256 result = ~uint256(0);
-    if (!scryptHash((pbegin == pend ? pblank : static_cast<const void*>(&pbegin[0])), (char*)&result, 1048576))
-        LogPrintf("Failed to generate scrypt² hash!\n");
+    uint256 result;
+    if ((pend - pbegin) * sizeof(pbegin[0]) != 80 || !scryptHash(static_cast<const void*>(&pbegin[0]), (char*)&result, 1048576) || result == ZERO) {
+        LogPrintf("Falling back to original implementation to generate scrypt² hash\n");
+        return scrypt_hash((pbegin == pend ? PBLANK : static_cast<const void*>(&pbegin[0])), (pend - pbegin) * sizeof(pbegin[0]), 1048576);
+    }
     return result;
 }
 
-void scrypt_hash(const char* pass, unsigned int pLen, const char* salt, unsigned int sLen, char* output, unsigned int N, unsigned int r, unsigned int p, unsigned int dkLen);
+inline void scrypt_hash(const char* pass, unsigned int pLen, const char* salt, unsigned int sLen, char* output, unsigned int N, unsigned int r, unsigned int p, unsigned int dkLen) // thin overload that forwards every argument, unchanged and in order, to scrypt()
+{
+    scrypt(pass, pLen, salt, sLen, output, N, r, p, dkLen);
+}
 
 #endif // SIMPLICITY_HASH_H
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 815e996..79dc8e4 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -2095,7 +2095,7 @@ bool ReadBlockFromDisk(CBlock& block, const CDiskBlockPos& pos)
 
     // Check the header
     // treat PoW and PoS blocks the same - don't waste time on redundant PoW checks that won't catch invalid PoS blocks anyway
-    if (block.GetHash() != Params().HashGenesisBlock() && block.IsProofOfWork() && CBlockHeader::GetAlgo(block.nVersion) != POW_SCRYPT_SQUARED && !CheckProofOfWork(&block))
+    if (block.IsProofOfWork() && CBlockHeader::GetAlgo(block.nVersion) != POW_SCRYPT_SQUARED && !CheckProofOfWork(&block))
         return error("ReadBlockFromDisk : Errors in block header");
 
     return true;
@@ -3079,7 +3079,7 @@ bool ConnectBlock(const CBlock& block, CValidationState& state, CBlockIndex* pin
         // return state.DoS(100, error("ConnectBlock() : PoW period ended"),
             // REJECT_INVALID, "PoW-ended");
 
-    if (block.nVersion < Params().WALLET_UPGRADE_VERSION() && /*block.GetHash() != Params().HashGenesisBlock() &&*/ !CheckWork(block, pindex->pprev))
+    if ((fVerifyingBlocks || fReindex || block.nVersion < Params().WALLET_UPGRADE_VERSION()) && /*block.GetHash() != Params().HashGenesisBlock() &&*/ !CheckWork(block, pindex->pprev))
         return false;
 
     if (block.IsProofOfStake()) {
@@ -3582,14 +3582,13 @@ void static UpdateTip(CBlockIndex* pindexNew)
         int nUpgraded = 0;
         const CBlockIndex* pindex = chainActive.Tip();
         for (int i = 0; i < 100 && pindex != NULL; i++) {
-            if (pindex->nVersion > ALGO_POW_SCRYPT_SQUARED)
+            if (pindex->nVersion > (uint32_t)ALGO_POW_SCRYPT_SQUARED)
                 ++nUpgraded;
             pindex = pindex->pprev;
         }
         if (nUpgraded > 0)
-            LogPrintf("%s: %d of last 100 blocks above version %d\n", __func__, nUpgraded, ALGO_POW_SCRYPT_SQUARED);
-        if (nUpgraded > 100/2)
-        {
+            LogPrintf("%s: %i of last 100 blocks above version %u\n", __func__, nUpgraded, ALGO_POW_SCRYPT_SQUARED);
+        if (nUpgraded > 100/2) {
             // strMiscWarning is read by GetWarnings(), called by Qt and the JSON-RPC code to warn the user:
             strMiscWarning = _("Warning: This version is obsolete; upgrade required!");
             CAlert::Notify(strMiscWarning, true);
@@ -4320,7 +4319,7 @@ bool CheckBlockHeader(const CBlockHeader& block, CValidationState& state, bool f
         return state.DoS(100, error("%s : block %s has an invalid type", __func__, block.GetHash().GetHex()));
 
     // Check proof of work matches claimed amount
-    if (block.GetHash() != Params().HashGenesisBlock() && (fVerifyingBlocks || fReindex || block.nTime >= nBlockCheckTime || CBlockHeader::GetAlgo(block.nVersion) != POW_SCRYPT_SQUARED) && fCheckPOW && block.IsProofOfWork() && !CheckProofOfWork(&block))
+    if ((fVerifyingBlocks || fReindex || block.nTime >= nBlockCheckTime || CBlockHeader::GetAlgo(block.nVersion) != POW_SCRYPT_SQUARED) && fCheckPOW && block.IsProofOfWork() && !CheckProofOfWork(&block))
         return state.DoS(50, error("%s : proof of work failed", __func__),
             REJECT_INVALID, "high-hash");
 
@@ -4544,7 +4543,7 @@ bool ContextualCheckBlockHeader(const CBlockHeader& block, CValidationState& sta
 
         if (Params().NetworkID() != CBaseChainParams::REGTEST && nHeight >= 10 + Params().WALLET_UPGRADE_BLOCK() + Params().COINSTAKE_MIN_DEPTH()) {
             int end = std::max(std::min(nHeight - 9 - Params().WALLET_UPGRADE_BLOCK() - Params().COINSTAKE_MIN_DEPTH(), 10), 0); // start checking one more at a time until we can enforce on all new blocks
-            int typeCount[ALGO_COUNT] = { };
+            int typeCount[ALGO_COUNT] = {};
             //int proofOfWorkCount = 0;
             if (CBlockHeader::GetAlgo(block.nVersion) == -1)
                 return false;
@@ -4616,7 +4615,7 @@ bool ContextualCheckBlockHeader(const CBlockHeader& block, CValidationState& sta
         return state.DoS(0, error("%s : forked chain older than last checkpoint (height %d)", __func__, nHeight));
 
     // Reject block.nVersion=1, ..., CURRENT_VERSION-1 blocks when 95% (75% on testnet) of the network has upgraded:
-    for (int version = 2; version <= CBlockHeader::CURRENT_VERSION; version++) {
+    for (unsigned int version = 2; version <= CBlockHeader::CURRENT_VERSION; version++) {
         if (block.nVersion < version && CBlockIndex::IsSuperMajority(version, pindexPrev, Params().RejectBlockOutdatedMajority())) {
             return state.Invalid(error("%s : rejected nVersion=%d block", __func__, block.nVersion), REJECT_OBSOLETE, "bad-version");
         }
@@ -4693,7 +4692,7 @@ static bool AcceptBlockHeader(const CBlockHeader& block, CValidationState& state
             return true;
         }
 
-        if (block.nVersion >= Params().WALLET_UPGRADE_VERSION() && !CheckBlockHeader(block, state, !fAlreadyCheckedHeader)) {
+        if (!CheckBlockHeader(block, state, !fAlreadyCheckedHeader && (block.nNonce != 0 || block.nVersion >= Params().WALLET_UPGRADE_VERSION()))) { //nNonce = 0 for PoS blocks
             LogPrintf("%s : CheckBlockHeader failed\n", __func__);
             return false;
         }
@@ -4792,7 +4791,7 @@ static bool AcceptBlock(CBlock& block, CValidationState& state, CBlockIndex** pp
     return true;
 }
 
-bool CBlockIndex::IsSuperMajority(int minVersion, const CBlockIndex* pstart, unsigned int nRequired)
+bool CBlockIndex::IsSuperMajority(unsigned int minVersion, const CBlockIndex* pstart, unsigned int nRequired)
 {
     unsigned int nToCheck = Params().ToCheckBlockUpgradeMajority();
     unsigned int nFound = 0;
@@ -4886,7 +4885,7 @@ bool ProcessNewBlock(CValidationState& state, CNode* pfrom, CBlock* pblock, bool
     if (!ActivateBestChain(state, pblock, checked))
         return error("%s : ActivateBestChain failed", __func__);
 
-    LogPrintf("%s : ACCEPTED Block %ld in %ld milliseconds with size=%d\n", __func__, GetHeight(), GetTimeMillis() - nStartTime,
+    LogPrint("net", "%s : ACCEPTED Block %ld in %ld milliseconds with size=%d\n", __func__, GetHeight(), GetTimeMillis() - nStartTime,
               pblock->GetSerializeSize(SER_DISK, CLIENT_VERSION));
 
     return true;
@@ -6616,7 +6615,6 @@ bool static ProcessMessage(CNode* pfrom, std::string strCommand, CDataStream& vR
         }
 
         if (GetBoolArg("-headerspamfilter", DEFAULT_HEADER_SPAM_FILTER) && !IsInitialBlockDownload()) {
-            LOCK(cs_main);
             CValidationState state;
             CNodeState *nodestate = State(pfrom->GetId());
             nodestate->headers.addHeaders(nFirst, nLast);
diff --git a/src/miner.cpp b/src/miner.cpp
index 28f59f0..b6de4bd 100644
--- a/src/miner.cpp
+++ b/src/miner.cpp
@@ -108,7 +108,7 @@ CBlockTemplate* CreateNewBlock(const CScript& scriptPubKeyIn, CWallet* pwallet,
     if (!pblocktemplate.get())
         return NULL;
     CBlock* pblock = &pblocktemplate->block; // pointer for convenience
-    int ver = 0;
+    uint32_t ver = 0;
 
     // Tip
     CBlockIndex* pindexPrev = nullptr;
@@ -475,7 +475,7 @@ CBlockTemplate* CreateNewBlock(const CScript& scriptPubKeyIn, CWallet* pwallet,
 
         nLastBlockTx = nBlockTx;
         nLastBlockSize = nBlockSize;
-        LogPrintf("CreateNewBlock(): total size %u\n", nBlockSize);
+        LogPrint("simplicity", "CreateNewBlock(): total size %u\n", nBlockSize);
 
         // Compute final coinbase transaction.
         if (!fProofOfStake) {
@@ -581,7 +581,7 @@ void IncrementExtraNonce(CBlock* pblock, CBlockIndex* pindexPrev, unsigned int&
 //
 // Internal miner
 //
-double dHashesPerSec = 0.0;
+double dHashesPerMin = 0.0;
 int64_t nHPSTimerStart = 0;
 
 CBlockTemplate* CreateNewBlockWithKey(CReserveKey& reservekey, CWallet* pwallet)
@@ -647,179 +647,224 @@ void BitcoinMiner(CWallet* pwallet, bool fProofOfStake)
     SetThreadPriority(THREAD_PRIORITY_LOWEST);
     RenameThread("simplicity-miner");
 
+    // Build buffer and check for memory availability
+    bool memory = true;
+    unsigned char *scratchbuf = nullptr;
+    if (nCreateBlockAlgo == POW_SCRYPT_SQUARED) {
+        scratchbuf = scrypt_buffer_alloc(1048576);
+        if (!scratchbuf) {
+            memory = false;
+            LogPrintf("Failed to allocate memory for scrypt² mining thread!\n");
+        }
+    }
+
     // Each thread has its own key and counter
     CReserveKey reservekey(pwallet);
     unsigned int nExtraNonce = 0;
     bool fLastLoopOrphan = false;
-    while (fGenerateBitcoins || fProofOfStake) {
-        if (fProofOfStake) {
-            //control the amount of times the client will check for mintable coins
-            if ((GetTime() - nMintableLastCheck > 5 * 60)) // 5 minute check time
-            {
-                nMintableLastCheck = GetTime();
-                fMintableCoins = pwallet->MintableCoins();
-            }
-
-            if (chainActive.Height() + 1 < Params().WALLET_UPGRADE_BLOCK() && Params().NetworkID() == CBaseChainParams::MAIN) {
-                MilliSleep(5000);
-                continue; // Do not stake until the upgrade block
-            }
-
-            while (pwallet->IsLocked() || !fMintableCoins || (pwallet->GetBalance() > 0 && nReserveBalance >= pwallet->GetBalance()) ||
-                   ((vNodes.empty() || masternodeSync.NotCompleted()) && Params().MiningRequiresPeers())) {
-                nLastCoinStakeSearchInterval = 0;
-                MilliSleep(5000);
-                // Do a separate 1 minute check here to ensure fMintableCoins is updated
-                if (!fMintableCoins && (GetTime() - nMintableLastCheck > 1 * 60)) // 1 minute check time
+    try {
+        while ((fGenerateBitcoins && memory) || fProofOfStake) {
+            if (fProofOfStake) {
+                // limit how often the client checks for mintable coins
+                if ((GetTime() - nMintableLastCheck > 5 * 60)) // 5 minute check time
                 {
                     nMintableLastCheck = GetTime();
                     fMintableCoins = pwallet->MintableCoins();
                 }
-            }
 
-            //search our map of hashed blocks, see if bestblock has been hashed yet
-            if (mapHashedBlocks.count(chainActive.Tip()->nHeight) && !fLastLoopOrphan)
-            {
-                // wait half of the nHashDrift with max wait of 3 minutes
-                if (GetTime() - mapHashedBlocks[chainActive.Tip()->nHeight] < std::max(pwallet->nHashInterval, (unsigned int)1))
-                {
+                if (chainActive.Height() + 1 < Params().WALLET_UPGRADE_BLOCK() && Params().NetworkID() == CBaseChainParams::MAIN) {
                     MilliSleep(5000);
-                    continue;
+                    continue; // Do not stake until the upgrade block
+                }
+
+                while (pwallet->IsLocked() || !fMintableCoins || (pwallet->GetBalance() > 0 && nReserveBalance >= pwallet->GetBalance()) ||
+                       ((vNodes.empty() || masternodeSync.NotCompleted()) && Params().MiningRequiresPeers())) {
+                    nLastCoinStakeSearchInterval = 0;
+                    MilliSleep(5000);
+                    // Do a separate 1 minute check here to ensure fMintableCoins is updated
+                    if (!fMintableCoins && (GetTime() - nMintableLastCheck > 1 * 60)) // 1 minute check time
+                    {
+                        nMintableLastCheck = GetTime();
+                        fMintableCoins = pwallet->MintableCoins();
+                    }
+                }
+
+                //search our map of hashed blocks, see if bestblock has been hashed yet
+                if (mapHashedBlocks.count(chainActive.Tip()->nHeight) && !fLastLoopOrphan)
+                {
+                    // hold off until the time elapsed since the tip's mapHashedBlocks entry reaches nHashInterval (treated as at least 1)
+                    if (GetTime() - mapHashedBlocks[chainActive.Tip()->nHeight] < std::max(pwallet->nHashInterval, (unsigned int)1))
+                    {
+                        MilliSleep(5000);
+                        continue;
+                    }
                 }
             }
-        }
 
-        MilliSleep(1000);
+            //MilliSleep(1000);
 
-        //
-        // Create new block
-        //
-        unsigned int nTransactionsUpdatedLast = mempool.GetTransactionsUpdated();
-        CBlockIndex* pindexPrev = chainActive.Tip();
-        if (!pindexPrev)
-            continue;
+            //
+            // Create new block
+            //
+            unsigned int nTransactionsUpdatedLast = mempool.GetTransactionsUpdated();
+            CBlockIndex* pindexPrev = chainActive.Tip();
+            if (!pindexPrev)
+                continue;
 
-        std::unique_ptr<CBlockTemplate> pblocktemplate(
-                fProofOfStake ? CreateNewBlock(CScript(), pwallet, fProofOfStake) : CreateNewBlockWithKey(reservekey, pwallet)
-                        );
-        if (!pblocktemplate.get())
-            continue;
+            std::unique_ptr<CBlockTemplate> pblocktemplate(
+                    fProofOfStake ? CreateNewBlock(CScript(), pwallet, fProofOfStake) : CreateNewBlockWithKey(reservekey, pwallet)
+                            );
+            if (!pblocktemplate.get())
+                continue;
 
-        CBlock* pblock = &pblocktemplate->block;
-        IncrementExtraNonce(pblock, pindexPrev, nExtraNonce);
+            CBlock* pblock = &pblocktemplate->block;
+            IncrementExtraNonce(pblock, pindexPrev, nExtraNonce);
 
-        //Stake miner main
-        if (fProofOfStake) {
-            LogPrintf("CPUMiner : proof-of-stake block found %s\n", pblock->GetHash().ToString().c_str());
-            if (pblock->IsZerocoinStake()) {
-                //Find the key associated with the zerocoin that is being staked
-                libzerocoin::CoinSpend spend = TxInToZerocoinSpend(pblock->vtx[1].vin[0]);
-                CBigNum bnSerial = spend.getCoinSerialNumber();
-                CKey key;
-                if (!pwallet->GetZerocoinKey(bnSerial, key)) {
-                    LogPrintf("%s: failed to find zSPL with serial %s, unable to sign block\n", __func__, bnSerial.GetHex());
+            //Stake miner main
+            if (fProofOfStake) {
+                LogPrintf("CPUMiner : proof-of-stake block found %s\n", pblock->GetHash().ToString().c_str());
+                if (pblock->IsZerocoinStake()) {
+                    //Find the key associated with the zerocoin that is being staked
+                    libzerocoin::CoinSpend spend = TxInToZerocoinSpend(pblock->vtx[1].vin[0]);
+                    CBigNum bnSerial = spend.getCoinSerialNumber();
+                    CKey key;
+                    if (!pwallet->GetZerocoinKey(bnSerial, key)) {
+                        LogPrintf("%s: failed to find zSPL with serial %s, unable to sign block\n", __func__, bnSerial.GetHex());
+                        continue;
+                    }
+
+                    //Sign block with the zSPL key
+                    if (!SignBlockWithKey(*pblock, key)) {
+                        LogPrintf("%s: Signing new block with zSPL key failed\n", __func__);
+                        continue;
+                    }
+                } else if (!SignBlock(*pblock, *pwallet)) {
+                    LogPrintf("%s: Signing new block with UTXO key failed\n", __func__);
                     continue;
                 }
 
-                //Sign block with the zSPL key
-                if (!SignBlockWithKey(*pblock, key)) {
-                    LogPrintf("%s: Signing new block with zSPL key failed\n", __func__);
+                LogPrintf("CPUMiner : proof-of-stake block was signed %s\n", pblock->GetHash().ToString().c_str());
+                SetThreadPriority(THREAD_PRIORITY_NORMAL);
+                if (!ProcessBlockFound(pblock, *pwallet, reservekey)) {
+                    fLastLoopOrphan = true;
                     continue;
                 }
-            } else if (!SignBlock(*pblock, *pwallet)) {
-                LogPrintf("%s: Signing new block with UTXO key failed\n", __func__);
-                continue;
-            }
+                SetThreadPriority(THREAD_PRIORITY_LOWEST);
 
-            LogPrintf("CPUMiner : proof-of-stake block was signed %s\n", pblock->GetHash().ToString().c_str());
-            SetThreadPriority(THREAD_PRIORITY_NORMAL);
-            if (!ProcessBlockFound(pblock, *pwallet, reservekey)) {
-                fLastLoopOrphan = true;
                 continue;
             }
-            SetThreadPriority(THREAD_PRIORITY_LOWEST);
-
-            continue;
-        }
-
-        LogPrintf("Running SimplicityMiner with %u transactions in block (%u bytes)\n", pblock->vtx.size(),
-            ::GetSerializeSize(*pblock, SER_NETWORK, PROTOCOL_VERSION));
 
-        //
-        // Search
-        //
-        int64_t nStart = GetTime();
-        uint256 hashTarget = uint256().SetCompact(pblock->nBits);
-        while (true) {
-            unsigned int nHashesDone = 0;
+            LogPrint("simplicity", "Running SimplicityMiner with %u transactions in block (%u bytes)\n", pblock->vtx.size(),
+                ::GetSerializeSize(*pblock, SER_NETWORK, PROTOCOL_VERSION));
 
-            uint256 hash;
+            //
+            // Search
+            //
+            int64_t nStart = GetTime();
+            uint256 hashTarget = uint256().SetCompact(pblock->nBits);
             while (true) {
-                hash = pblock->GetPoWHash();
-                if (hash <= hashTarget) {
-                    // Found a solution
-                    SetThreadPriority(THREAD_PRIORITY_NORMAL);
-                    LogPrintf("%s:\n", __func__);
-                    LogPrintf("proof-of-work found  \n  hash: %s  \ntarget: %s\n", hash.GetHex(), hashTarget.GetHex());
-                    ProcessBlockFound(pblock, *pwallet, reservekey);
-                    SetThreadPriority(THREAD_PRIORITY_LOWEST);
-
-                    // In regression test mode, stop mining after a block is found. This
-                    // allows developers to controllably generate a block on demand.
-                    if (Params().MineBlocksOnDemand())
-                        throw boost::thread_interrupted();
+                unsigned int nHashesDone = 0;
+
+                if (nCreateBlockAlgo == POW_SCRYPT_SQUARED) {
+                    unsigned int runs = 0;
+                    while (true) {
+                        int nHashes = 0;
+                        if (scrypt_N_1_1_256_multi(BEGIN(pblock->nVersion), hashTarget, &nHashes, scratchbuf, 1048576)) {
+                            // Found a solution
+                            SetThreadPriority(THREAD_PRIORITY_NORMAL);
+                            LogPrintf("%s:\n", __func__);
+                            LogPrintf("proof-of-work found\n   hash: %s\n target: %s\n  nonce: %i\n", pblock->GetPoWHash().GetHex(), hashTarget.GetHex(), pblock->nNonce);
+                            ProcessBlockFound(pblock, *pwallet, reservekey);
+                            SetThreadPriority(THREAD_PRIORITY_LOWEST);
+
+                            // In regression test mode, stop mining after a block is found. This
+                            // allows developers to controllably generate a block on demand.
+                            if (Params().MineBlocksOnDemand())
+                                throw boost::thread_interrupted();
 
-                    break;
+                            break;
+                        }
+                        pblock->nNonce += nHashes;
+                        nHashesDone += nHashes;
+                        if (runs & 0x1)
+                            break;
+                        runs++;
+                    }
+                } else {
+                    uint256 hash;
+                    while (true) {
+                        hash = pblock->GetPoWHash();
+                        if (hash <= hashTarget) {
+                            // Found a solution
+                            SetThreadPriority(THREAD_PRIORITY_NORMAL);
+                            LogPrintf("%s:\n", __func__);
+                            LogPrintf("proof-of-work found\n   hash: %s\n target: %s\n  nonce: %i\n", hash.GetHex(), hashTarget.GetHex(), pblock->nNonce);
+                            ProcessBlockFound(pblock, *pwallet, reservekey);
+                            SetThreadPriority(THREAD_PRIORITY_LOWEST);
+
+                            // In regression test mode, stop mining after a block is found. This
+                            // allows developers to controllably generate a block on demand.
+                            if (Params().MineBlocksOnDemand())
+                                throw boost::thread_interrupted();
+
+                            break;
+                        }
+                        pblock->nNonce += 1;
+                        nHashesDone += 1;
+                        if ((pblock->nNonce & 0xFF) == 0)
+                            break;
+                    }
                 }
-                pblock->nNonce += 1;
-                nHashesDone += 1;
-                if ((pblock->nNonce & 0xFF) == 0)
-                    break;
-            }
 
-            // Meter hashes/sec
-            static int64_t nHashCounter;
-            if (nHPSTimerStart == 0) {
-                nHPSTimerStart = GetTimeMillis();
-                nHashCounter = 0;
-            } else
-                nHashCounter += nHashesDone;
-            if (GetTimeMillis() - nHPSTimerStart > 4000) {
-                static CCriticalSection cs;
+                // Meter hashes/sec
+                static int64_t nHashCounter;
                 {
-                    LOCK(cs);
-                    if (GetTimeMillis() - nHPSTimerStart > 4000) {
-                        dHashesPerSec = 1000.0 * nHashCounter / (GetTimeMillis() - nHPSTimerStart);
-                        nHPSTimerStart = GetTimeMillis();
-                        nHashCounter = 0;
-                        static int64_t nLogTime;
-                        if (GetTime() - nLogTime > 30 * 60) {
-                            nLogTime = GetTime();
-                            LogPrintf("hashmeter %6.0f khash/s\n", dHashesPerSec / 1000.0);
+                    static CCriticalSection cs;
+                    {
+                        LOCK(cs);
+                        if (nHPSTimerStart == 0) {
+                            nHPSTimerStart = GetTimeMillis();
+                            nHashCounter = 0;
+                        } else
+                            nHashCounter += nHashesDone;
+
+                        if (GetTimeMillis() - nHPSTimerStart > 30000) {
+                            dHashesPerMin = 60000.0 * nHashCounter / (GetTimeMillis() - nHPSTimerStart);
+                            nHPSTimerStart = GetTimeMillis();
+                            nHashCounter = 0;
+                            static int64_t nLogTime;
+                            if (GetTime() - nLogTime > 120) {
+                                nLogTime = GetTime();
+                                LogPrintf("Total local hashrate %6.1f khash/min\n", dHashesPerMin/1000.0);
+                            }
                         }
                     }
                 }
-            }
 
-            // Check for stop or if block needs to be rebuilt
-            boost::this_thread::interruption_point();
-            // Regtest mode doesn't require peers
-            if (vNodes.empty() && Params().MiningRequiresPeers())
-                break;
-            if (pblock->nNonce >= 0xffff0000)
-                break;
-            if (mempool.GetTransactionsUpdated() != nTransactionsUpdatedLast && GetTime() - nStart > 60)
-                break;
-            if (pindexPrev != chainActive.Tip())
-                break;
-
-            // Update nTime every few seconds
-            UpdateTime(pblock, pindexPrev, pblock->IsProofOfStake());
-            if (Params().AllowMinDifficultyBlocks()) {
-                // Changing pblock->nTime can change work required on testnet:
-                hashTarget.SetCompact(pblock->nBits);
+                // Check for stop or if block needs to be rebuilt
+                boost::this_thread::interruption_point();
+                // Regtest mode doesn't require peers
+                if (vNodes.empty() && Params().MiningRequiresPeers())
+                    break;
+                if (pblock->nNonce >= 0xffff0000)
+                    break;
+                if (mempool.GetTransactionsUpdated() != nTransactionsUpdatedLast && GetTime() - nStart > 60)
+                    break;
+                if (pindexPrev != chainActive.Tip())
+                    break;
+
+                // Update nTime every few seconds
+                UpdateTime(pblock, pindexPrev, fProofOfStake);
+                if (Params().AllowMinDifficultyBlocks()) {
+                    // Changing pblock->nTime can change work required on testnet:
+                    hashTarget.SetCompact(pblock->nBits);
+                }
             }
         }
+    } catch (boost::thread_interrupted) {
+        free(scratchbuf);
+        //LogPrintf("SimplicityMiner terminated\n");
+        throw boost::thread_interrupted();
     }
 }
 
@@ -831,9 +876,9 @@ void static ThreadBitcoinMiner(void* parg)
         BitcoinMiner(pwallet, false);
         boost::this_thread::interruption_point();
     } catch (std::exception& e) {
-        LogPrintf("SimplicityMiner exception");
+        LogPrintf("SimplicityMiner exception\n");
     } catch (...) {
-        LogPrintf("SimplicityMiner exception");
+        LogPrintf("SimplicityMiner exception\n");
     }
 
     LogPrintf("SimplicityMiner exiting\n");
diff --git a/src/miner.h b/src/miner.h
index fe552ba..198dc46 100644
--- a/src/miner.h
+++ b/src/miner.h
@@ -38,7 +38,7 @@ void UpdateTime(CBlockHeader* block, const CBlockIndex* pindexPrev, bool fProofO
     void ThreadStakeMinter();
 #endif // ENABLE_WALLET
 
-extern double dHashesPerSec;
+extern double dHashesPerMin;
 extern int64_t nHPSTimerStart;
 
 #endif // BITCOIN_MINER_H
diff --git a/src/pow.cpp b/src/pow.cpp
index e8602eb..b881a24 100644
--- a/src/pow.cpp
+++ b/src/pow.cpp
@@ -26,25 +26,26 @@ const CBlockIndex* GetLastBlockIndex(const CBlockIndex* pindex, bool fProofOfSta
 
 const CBlockIndex* GetLastBlockIndex(const CBlockIndex* pindex, int algo)
 {
-    bool newDiff = pindex->nTime >= Params().BadScryptDiffTimeEnd();
-    while (pindex && pindex->pprev && (CBlockHeader::GetAlgo(pindex->nVersion) != algo || (newDiff && algo == POW_SCRYPT_SQUARED && pindex->nTime < Params().BadScryptDiffTimeEnd() && pindex->nTime >= Params().BadScryptDiffTimeStart())))
+    bool newDiff = algo == POW_SCRYPT_SQUARED && pindex->nTime >= Params().BadScryptDiffTimeEnd();
+    while (pindex && pindex->pprev && (CBlockHeader::GetAlgo(pindex->nVersion) != algo || (newDiff && pindex->nTime < Params().BadScryptDiffTimeEnd() && pindex->nTime >= Params().BadScryptDiffTimeStart())))
         pindex = pindex->pprev;
     return pindex;
 }
 
 unsigned int GetNextWorkRequired(const CBlockIndex* pindexLast, const CBlockHeader* pblock, bool fProofOfStake)
 {
-    uint256 bnTargetLimit = fProofOfStake ? Params().ProofOfStakeLimit() : Params().ProofOfWorkLimit(pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? CBlockHeader::GetAlgo(pblock->nVersion) : POW_QUARK);
+    int algo = CBlockHeader::GetAlgo(pblock->nVersion);
+    uint256 bnTargetLimit = fProofOfStake ? Params().ProofOfStakeLimit() : Params().ProofOfWorkLimit(pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? algo : POW_QUARK);
     //if (Params().NetworkID() != CBaseChainParams::MAIN && !fProofOfStake) return bnTargetLimit.GetCompact(); // for testing
 
     if (pindexLast == NULL)
         return bnTargetLimit.GetCompact(); // genesis block
 
-    const CBlockIndex* pindexPrev = pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? GetLastBlockIndex(pindexLast, CBlockHeader::GetAlgo(pblock->nVersion)) : GetLastBlockIndex(pindexLast, fProofOfStake);
+    const CBlockIndex* pindexPrev = pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? GetLastBlockIndex(pindexLast, algo) : GetLastBlockIndex(pindexLast, fProofOfStake);
     if (pindexPrev->pprev == NULL)
         return bnTargetLimit.GetCompact(); // first block
 
-    const CBlockIndex* pindexPrevPrev = pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? GetLastBlockIndex(pindexPrev->pprev, CBlockHeader::GetAlgo(pblock->nVersion)) : GetLastBlockIndex(pindexPrev->pprev, fProofOfStake);
+    const CBlockIndex* pindexPrevPrev = pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? GetLastBlockIndex(pindexPrev->pprev, algo) : GetLastBlockIndex(pindexPrev->pprev, fProofOfStake);
     if (pindexPrevPrev->pprev == NULL)
         return bnTargetLimit.GetCompact(); // second block
 
@@ -123,22 +124,26 @@ bool CheckProofOfWork(const CBlockHeader* pblock)
     if (Params().SkipProofOfWorkCheck())
         return true;
 
+    int algo = CBlockHeader::GetAlgo(pblock->nVersion);
     bnTarget.SetCompact(pblock->nBits, &fNegative, &fOverflow);
 
     // Check range
-    if (fNegative || bnTarget == 0 || fOverflow || bnTarget > Params().ProofOfWorkLimit(pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? CBlockHeader::GetAlgo(pblock->nVersion) : POW_QUARK))
+    if (fNegative || bnTarget == 0 || fOverflow || bnTarget > Params().ProofOfWorkLimit(pblock->nVersion >= Params().WALLET_UPGRADE_VERSION() ? algo : POW_QUARK))
         return error("CheckProofOfWork() : nBits below minimum work");
 
-    if (CBlockHeader::GetAlgo(pblock->nVersion) == POW_SCRYPT_SQUARED && pblock->nTime < Params().BadScryptDiffTimeEnd() && pblock->nTime >= Params().BadScryptDiffTimeStart()) {
-        LogPrintf("CheckProofOfWork() : skipping block %s affected by difficulty bug\n", pblock->GetHash().GetHex());
+    if (algo == POW_SCRYPT_SQUARED && pblock->nTime < Params().BadScryptDiffTimeEnd() && pblock->nTime >= Params().BadScryptDiffTimeStart()) {
+        LogPrintf("CheckProofOfWork() : skipping block %s affected by scrypt difficulty bug\n", pblock->GetHash().GetHex());
         return true;
     }
-    
+
     // Check proof of work matches claimed amount
     if (pblock->GetPoWHash() > bnTarget) {
         if (Params().MineBlocksOnDemand())
             return false;
-        else
+        else if (pblock->GetHash() == Params().HashGenesisBlock() && Params().NetworkID() == CBaseChainParams::MAIN) {
+            LogPrintf("CheckProofOfWork() : accepting genesis block\n");
+            return true;
+        } else
             return error("CheckProofOfWork() : hash doesn't match nBits");
     }
 
diff --git a/src/primitives/block.h b/src/primitives/block.h
index 1a27d9d..b99941b 100644
--- a/src/primitives/block.h
+++ b/src/primitives/block.h
@@ -44,8 +44,8 @@ class CBlockHeader
 {
 public:
     // header
-    static const int32_t CURRENT_VERSION=8;
-    int32_t nVersion;
+    static const uint32_t CURRENT_VERSION = 8;
+    uint32_t nVersion;
     uint256 hashPrevBlock;
     uint256 hashMerkleRoot;
     uint32_t nTime;
@@ -121,7 +121,7 @@ class CBlockHeader
         }
     }
 
-    static int GetVer(int algo)
+    static uint32_t GetVer(int algo)
     {
         switch (algo) {
             case POS:
diff --git a/src/qt/splashscreen.cpp b/src/qt/splashscreen.cpp
index e543b8e..b29702d 100644
--- a/src/qt/splashscreen.cpp
+++ b/src/qt/splashscreen.cpp
@@ -26,9 +26,9 @@ SplashScreen::SplashScreen(Qt::WindowFlags f, const NetworkStyle* networkStyle)
 {
     // set reference point, paddings
     int paddingLeft = 14;
-    int paddingTop = 470;
+    int paddingTop = 34;
     int titleVersionVSpace = 17;
-    int titleCopyrightVSpace = 32;
+    //int titleCopyrightVSpace = 32;
 
     float fontFactor = 1.0;
 
@@ -68,10 +68,10 @@ SplashScreen::SplashScreen(Qt::WindowFlags f, const NetworkStyle* networkStyle)
 
     // draw copyright stuff
     pixPaint.setFont(QFont(font, 10 * fontFactor));
-    pixPaint.drawText(paddingLeft, paddingTop + titleCopyrightVSpace, copyrightTextBtc);
-    pixPaint.drawText(paddingLeft, paddingTop + titleCopyrightVSpace + 12, copyrightTextDash);
-    pixPaint.drawText(paddingLeft, paddingTop + titleCopyrightVSpace + 24, copyrightTextPIVX);
-    pixPaint.drawText(paddingLeft, paddingTop + titleCopyrightVSpace + 36, copyrightTextSPL);
+    pixPaint.drawText(paddingLeft + 270, paddingTop-titleVersionVSpace /*+ titleCopyrightVSpace*/, copyrightTextBtc);
+    pixPaint.drawText(paddingLeft + 270, paddingTop-titleVersionVSpace /*+ titleCopyrightVSpace*/ + 12, copyrightTextDash);
+    pixPaint.drawText(paddingLeft + 270, paddingTop-titleVersionVSpace /*+ titleCopyrightVSpace*/ + 24, copyrightTextPIVX);
+    pixPaint.drawText(paddingLeft + 270, paddingTop-titleVersionVSpace /*+ titleCopyrightVSpace*/ + 36, copyrightTextSPL);
 
     // draw additional text if special network
     if (!titleAddText.isEmpty()) {
diff --git a/src/rpc/blockchain.cpp b/src/rpc/blockchain.cpp
index 84a9067..9911fd5 100644
--- a/src/rpc/blockchain.cpp
+++ b/src/rpc/blockchain.cpp
@@ -113,7 +113,7 @@ UniValue blockheaderToJSON(const CBlockIndex* blockindex)
         confirmations = chainActive.Height() - blockindex->nHeight + 1;
     result.push_back(Pair("confirmations", confirmations));
     result.push_back(Pair("height", blockindex->nHeight));
-    result.push_back(Pair("version", blockindex->nVersion));
+    result.push_back(Pair("version", (uint64_t)blockindex->nVersion));
     result.push_back(Pair("merkleroot", blockindex->hashMerkleRoot.GetHex()));
     result.push_back(Pair("time", (int64_t)blockindex->nTime));
     result.push_back(Pair("mediantime", (int64_t)blockindex->GetMedianTimePast()));
@@ -142,7 +142,7 @@ UniValue blockToJSON(const CBlock& block, const CBlockIndex* blockindex, bool tx
     result.push_back(Pair("confirmations", confirmations));
     result.push_back(Pair("size", (int)::GetSerializeSize(block, SER_NETWORK, PROTOCOL_VERSION)));
     result.push_back(Pair("height", blockindex->nHeight));
-    result.push_back(Pair("version", block.nVersion));
+    result.push_back(Pair("version", (uint64_t)block.nVersion));
     result.push_back(Pair("merkleroot", block.hashMerkleRoot.GetHex()));
     //result.push_back(Pair("acc_checkpoint", block.nAccumulatorCheckpoint.GetHex()));
     UniValue txs(UniValue::VARR);
@@ -904,7 +904,7 @@ UniValue verifychain(const UniValue& params, bool fHelp)
 }
 
 /** Implementation of IsSuperMajority with better feedback */
-static UniValue SoftForkMajorityDesc(int minVersion, CBlockIndex* pindex, int nRequired)
+/*static UniValue SoftForkMajorityDesc(unsigned int minVersion, CBlockIndex* pindex, int nRequired)
 {
     int nFound = 0;
     CBlockIndex* pstart = pindex;
@@ -929,7 +929,7 @@ static UniValue SoftForkDesc(const std::string &name, int version, CBlockIndex*
     rv.push_back(Pair("enforce", SoftForkMajorityDesc(version, pindex, Params().EnforceBlockUpgradeMajority())));
     rv.push_back(Pair("reject", SoftForkMajorityDesc(version, pindex, Params().RejectBlockOutdatedMajority())));
     return rv;
-}
+}*/
 
 UniValue getblockchaininfo(const UniValue& params, bool fHelp)
 {
diff --git a/src/rpc/mining.cpp b/src/rpc/mining.cpp
index 7aa1488..7409862 100644
--- a/src/rpc/mining.cpp
+++ b/src/rpc/mining.cpp
@@ -203,6 +203,7 @@ UniValue setminingalgo(const UniValue& params, bool fHelp)
     if (algo <= POS || algo >= ALGO_COUNT)
         throw JSONRPCError(RPC_INVALID_PARAMETER, "Invalid algorithm");
 
+    GenerateBitcoins(false, nullptr, 0);
     LOCK(cs_main);
     nCreateBlockAlgo = algo;
 
@@ -270,9 +271,9 @@ UniValue gethashespersec(const UniValue& params, bool fHelp)
             "\nExamples:\n" +
             HelpExampleCli("gethashespersec", "") + HelpExampleRpc("gethashespersec", ""));
 
-    if (GetTimeMillis() - nHPSTimerStart > 8000)
+    if (GetTimeMillis() - nHPSTimerStart > 60000)
         return (int64_t)0;
-    return (int64_t)dHashesPerSec;
+    return (int64_t)(dHashesPerMin / 60);
 }
 #endif
 
@@ -618,7 +619,7 @@ UniValue getblocktemplate(const UniValue& params, bool fHelp)
 
     UniValue result(UniValue::VOBJ);
     result.push_back(Pair("capabilities", aCaps));
-    result.push_back(Pair("version", pblock->nVersion));
+    result.push_back(Pair("version", (uint64_t)pblock->nVersion));
     result.push_back(Pair("previousblockhash", pblock->hashPrevBlock.GetHex()));
     result.push_back(Pair("transactions", transactions));
     result.push_back(Pair("coinbaseaux", aux));
diff --git a/src/txdb.cpp b/src/txdb.cpp
index 04fe87a..33b20cd 100644
--- a/src/txdb.cpp
+++ b/src/txdb.cpp
@@ -267,8 +267,8 @@ bool CBlockTreeDB::LoadBlockIndexGuts()
                 //pindexNew->hashProofOfStake = diskindex.hashProofOfStake;
                 //pindexNew->hashProofOfWork = diskindex.hashProofOfWork;
 
-                // treat PoW and PoS blocks the same - don't waste time on redundant PoW checks that won't catch invalid PoS blocks anyway
-                if (pindexNew->GetBlockHash() != Params().HashGenesisBlock() && pindexNew->IsProofOfWork() && CBlockHeader::GetAlgo(pindexNew->nVersion) != POW_SCRYPT_SQUARED) {
+                // treat PoW and PoS blocks the same - don't waste time on redundant PoW checks that won't catch invalid PoS blocks anyway - nNonce = 0 for PoS blocks
+                if ((pindexNew->nNonce != 0 || pindexNew->nVersion >= Params().WALLET_UPGRADE_VERSION()) && pindexNew->IsProofOfWork() && CBlockHeader::GetAlgo(pindexNew->nVersion) != POW_SCRYPT_SQUARED) {
                     CBlockHeader header = pindexNew->GetBlockHeader();
                     if (!CheckProofOfWork(&header))
                         return error("LoadBlockIndex() : CheckProofOfWork failed: %s", pindexNew->ToString());
diff --git a/src/wallet/rpcdump.cpp b/src/wallet/rpcdump.cpp
index 9fbd543..65dfafb 100644
--- a/src/wallet/rpcdump.cpp
+++ b/src/wallet/rpcdump.cpp
@@ -25,8 +25,6 @@
 
 #include <univalue.h>
 
-void EnsureWalletIsUnlocked(bool fAllowAnonOnly);
-
 std::string static EncodeDumpTime(int64_t nTime)
 {
     return DateTimeStrFormat("%Y-%m-%dT%H:%M:%SZ", nTime);
diff --git a/src/wallet/wallet.cpp b/src/wallet/wallet.cpp
index 85ac87b..a4598e7 100644
--- a/src/wallet/wallet.cpp
+++ b/src/wallet/wallet.cpp
@@ -2448,7 +2448,7 @@ bool CWallet::CreateCoinStake(
                 if (outputs > 1) {
                     // Split the stake across the outputs
                     CAmount nShare = nRemaining / outputs;
-                    for (int i = 1; i < outputs; i++) {
+                    for (unsigned int i = 1; i < outputs; i++) {
                         // loop through all but the last one.
                         txNew.vout[i].nValue = nShare;
                         nRemaining -= nShare;