diff --git a/patches/gromacs-2021.7.config b/patches/gromacs-2021.7.config
deleted file mode 100644
index f69d5800af..0000000000
--- a/patches/gromacs-2021.7.config
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
-function plumed_preliminary_test(){
-# Succeeds only if ./README contains the word GROMACS (all grep output is discarded). NOTE(review): no "already configured" check is performed here.
-  grep -q GROMACS README 1>/dev/null 2>/dev/null
-}
-
-function plumed_patch_info(){ # Print the GROMACS patching notes below to stdout, verbatim.
-cat << EOF
-PLUMED can be incorporated into gromacs using the standard patching procedure.
-Patching must be done in the gromacs root directory  _before_ the cmake command is invoked.
-
-On clusters you may want to patch gromacs using the static version of plumed, in this case
-building gromacs can result in multiple errors. One possible solution is to configure gromacs
-with these additional options:
-
-cmake -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON
-
-To enable PLUMED in a gromacs simulation one should use
-mdrun with an extra -plumed flag. The flag can be used to
-specify the name of the PLUMED input file, e.g.:
-
-gmx mdrun -plumed plumed.dat
-
-For more information on gromacs you should visit http://www.gromacs.org
-
-EOF
-}
-
-plumed_before_patch(){
-  plumed_patch_info
-  # Rename gmxVersionInfo.cmake to .preplumed, then rebuild it from that copy with every "" on lines starting with set(GMX_VERSION_STRING_OF_FORK replaced by plumed-$PLUMED_VERSION.
-  mv cmake/gmxVersionInfo.cmake cmake/gmxVersionInfo.cmake.preplumed
-  awk -v version="$PLUMED_VERSION" '/^set\(GMX_VERSION_STRING_OF_FORK/{gsub(/""/, "plumed-" version)}; {print}' cmake/gmxVersionInfo.cmake.preplumed > cmake/gmxVersionInfo.cmake
-}
-
-PLUMED_PREPLUMED_IGNORE=cmake/gmxVersionInfo.cmake # NOTE(review): presumably exempts this file from the patch tool's own .preplumed handling, since plumed_before_patch creates that backup itself - confirm against the patch script
-
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/CMakeLists.txt b/patches/gromacs-2021.7.diff/src/gromacs/CMakeLists.txt
deleted file mode 100644
index 47bf3c4792..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/CMakeLists.txt
+++ /dev/null
@@ -1,479 +0,0 @@
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
-# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
-# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
-# and including many others, as listed in the AUTHORS file in the
-# top-level source directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-
-include(${CMAKE_SOURCE_DIR}/Plumed.cmake)
-
-set(LIBGROMACS_SOURCES)
-
-if (GMX_CLANG_CUDA)
-    include(gmxClangCudaUtils)
-endif()
-
-set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
-set_property(GLOBAL PROPERTY CUDA_SOURCES)
-set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
-set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
-
-set(libgromacs_object_library_dependencies "")
-function (_gmx_add_files_to_property PROPERTY) # Append every file in ARGN to the global property named by PROPERTY.
-    foreach (_file ${ARGN})
-        if (IS_ABSOLUTE "${_file}") # absolute paths are stored unchanged
-            set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file})
-        else() # relative paths are prefixed with CMAKE_CURRENT_LIST_DIR
-            set_property(GLOBAL APPEND PROPERTY ${PROPERTY}
-                         ${CMAKE_CURRENT_LIST_DIR}/${_file})
-        endif()
-    endforeach()
-endfunction ()
-
-function (gmx_add_libgromacs_sources) # Record the given files (ARGN) in the GMX_LIBGROMACS_SOURCES global property.
-    _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
-endfunction ()
-
-# Permit the configuration to disable compiling the many nbnxm kernels
-# and others involved in force calculations. Currently only
-# short-ranged and bonded kernels are disabled this way, but in future
-# others may be appropriate. Thus the cmake option is not specific to
-# nbnxm module.
-option(GMX_USE_SIMD_KERNELS "Whether to compile NBNXM and other SIMD kernels" ON)
-mark_as_advanced(GMX_USE_SIMD_KERNELS)
-
-# Add these contents first because linking their tests can take a lot
-# of time, so we want lots of parallel work still available after
-# linking starts.
-add_subdirectory(utility)
-# Add normal contents
-add_subdirectory(gmxlib)
-add_subdirectory(mdlib)
-add_subdirectory(applied_forces)
-add_subdirectory(listed_forces)
-add_subdirectory(nbnxm)
-add_subdirectory(commandline)
-add_subdirectory(domdec)
-add_subdirectory(ewald)
-add_subdirectory(fft)
-add_subdirectory(gpu_utils)
-add_subdirectory(hardware)
-add_subdirectory(linearalgebra)
-add_subdirectory(math)
-add_subdirectory(mdrun)
-add_subdirectory(mdrunutility)
-add_subdirectory(mdspan)
-add_subdirectory(mdtypes)
-add_subdirectory(onlinehelp)
-add_subdirectory(options)
-add_subdirectory(pbcutil)
-add_subdirectory(random)
-add_subdirectory(restraint)
-add_subdirectory(tables)
-add_subdirectory(taskassignment)
-add_subdirectory(timing)
-add_subdirectory(topology)
-add_subdirectory(trajectory)
-add_subdirectory(swap)
-add_subdirectory(essentialdynamics)
-add_subdirectory(pulling)
-add_subdirectory(simd)
-add_subdirectory(imd)
-add_subdirectory(compat)
-add_subdirectory(mimic)
-add_subdirectory(modularsimulator)
-if (NOT GMX_BUILD_MDRUN_ONLY)
-    add_subdirectory(gmxana)
-    add_subdirectory(gmxpreprocess)
-    add_subdirectory(correlationfunctions)
-    add_subdirectory(statistics)
-    add_subdirectory(analysisdata)
-    add_subdirectory(coordinateio)
-    add_subdirectory(trajectoryanalysis)
-    add_subdirectory(energyanalysis)
-    add_subdirectory(tools)
-endif()
-
-get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
-list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES})
-
-# This would be the standard way to include thread_mpi, but
-# we want libgromacs to link the functions directly
-#if(GMX_THREAD_MPI)
-#    add_subdirectory(thread_mpi)
-#endif()
-#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
-tmpi_get_source_list(THREAD_MPI_SOURCES ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/src)
-add_library(thread_mpi OBJECT ${THREAD_MPI_SOURCES})
-target_compile_definitions(thread_mpi PRIVATE HAVE_CONFIG_H)
-if(CYGWIN)
-    # Needs POSIX-isms for strdup, not just std-isms
-    target_compile_definitions(thread_mpi PRIVATE _POSIX_C_SOURCE=200809L)
-endif()
-gmx_target_compile_options(thread_mpi)
-if (WIN32)
-    gmx_target_warning_suppression(thread_mpi /wd4996 HAS_NO_MSVC_UNSAFE_FUNCTION)
-endif()
-list(APPEND libgromacs_object_library_dependencies thread_mpi)
-
-configure_file(version.h.cmakein version.h)
-if(GMX_INSTALL_LEGACY_API)
-  install(FILES
-          ${CMAKE_CURRENT_BINARY_DIR}/version.h
-	  analysisdata.h
-	  options.h
-	  selection.h
-	  trajectoryanalysis.h
-          DESTINATION include/gromacs)
-endif()
-
-# This code is here instead of utility/CMakeLists.txt, because CMake
-# custom commands and source file properties can only be set in the directory
-# that contains the target that uses them.
-# TODO: Generate a header instead that can be included from baseversion.cpp.
-# That probably simplifies things somewhat.
-set(GENERATED_VERSION_FILE utility/baseversion-gen.cpp)
-gmx_configure_version_file(
-    utility/baseversion-gen.cpp.cmakein ${GENERATED_VERSION_FILE}
-    REMOTE_HASH
-    EXTRA_VARS
-        GMX_SOURCE_DOI
-        GMX_RELEASE_HASH
-        GMX_SOURCE_HASH
-        )
-list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
-
-# Mark some shared GPU implementation files to compile with CUDA if needed
-if (GMX_GPU_CUDA)
-    get_property(CUDA_SOURCES GLOBAL PROPERTY CUDA_SOURCES)
-    set_source_files_properties(${CUDA_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-endif()
-
-if (GMX_GPU_CUDA)
-    # Work around FindCUDA that prevents using target_link_libraries()
-    # with keywords otherwise...
-    set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
-    if (NOT GMX_CLANG_CUDA)
-        gmx_cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
-    else()
-        add_library(libgromacs ${LIBGROMACS_SOURCES})
-    endif()
-    target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES})
-else()
-    add_library(libgromacs ${LIBGROMACS_SOURCES})
-endif()
-
-# Add these contents first because linking their tests can take a lot
-# of time, so we want lots of parallel work still available after
-# linking starts.
-add_subdirectory(fileio)
-add_subdirectory(selection)
-
-# Suppress a warning about our abuse of t_inputrec
-gmx_source_file_warning_suppression(mdtypes/inputrec.cpp -Wno-class-memaccess HAS_NO_CLASS_MEMACCESS)
-
-# Handle the object libraries that contain the source file
-# dependencies that need special handling because they are generated
-# or external code.
-foreach(object_library ${libgromacs_object_library_dependencies})
-    if (BUILD_SHARED_LIBS)
-        set_target_properties(${object_library} PROPERTIES POSITION_INDEPENDENT_CODE true)
-    endif()
-    target_include_directories(${object_library} SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
-
-    # Add the sources from the object libraries to the main library.
-    target_sources(libgromacs PRIVATE $<TARGET_OBJECTS:${object_library}>)
-endforeach()
-gmx_target_compile_options(libgromacs)
-target_compile_definitions(libgromacs PRIVATE HAVE_CONFIG_H)
-target_include_directories(libgromacs SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
-
-if (GMX_GPU_OPENCL)
-    option(GMX_EXTERNAL_CLFFT "True if an external clFFT is required to be used" FALSE)
-    mark_as_advanced(GMX_EXTERNAL_CLFFT)
-
-    # Default to using clFFT found on the system
-    # switch to quiet at the second run.
-    if (DEFINED clFFT_LIBRARY)
-        set (clFFT_FIND_QUIETLY TRUE)
-    endif()
-    find_package(clFFT)
-    if (NOT clFFT_FOUND)
-        if (GMX_EXTERNAL_CLFFT)
-            message(FATAL_ERROR "Did not find required external clFFT library, consider setting clFFT_ROOT_DIR")
-        endif()
-
-        if(MSVC)
-            message(FATAL_ERROR
-"An OpenCL build was requested with Visual Studio compiler, but GROMACS
-requires clFFT, which was not found on your system. GROMACS does bundle
-clFFT to help with building for OpenCL, but that clFFT has not yet been
-ported to the more recent versions of that compiler that GROMACS itself
-requires. Thus for now, OpenCL is not available with MSVC and the internal
-build of clFFT in GROMACS 2019. Either change compiler, try installing
-a clFFT package, or use the latest GROMACS 2018 point release.")
-        endif()
-
-        # Fall back on the internal version
-        set (_clFFT_dir ../external/clFFT/src)
-        add_subdirectory(${_clFFT_dir} clFFT-build)
-        target_sources(libgromacs PRIVATE
-            $<TARGET_OBJECTS:clFFT>
-        )
-        target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include)
-        # Use the magic variable for how to link any library needed for
-        # dlopen, etc.  which is -ldl where needed, and empty otherwise
-        # (e.g. Windows, BSD, Mac).
-        target_link_libraries(libgromacs PRIVATE "${CMAKE_DL_LIBS}")
-    else()
-        target_link_libraries(libgromacs PRIVATE clFFT)
-    endif()
-endif()
-
-# Permit GROMACS code to include externally developed headers, such as
-# the functionality from the nonstd project that we use for
-# gmx::compat::optional. These are included as system headers so that
-# no warnings are issued from them.
-#
-# TODO Perhaps generalize this for all headers from src/external
-target_include_directories(libgromacs SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/src/external)
-
-if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
-    # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file.
-    # On KNL this can cause illegal instruction because the compiler might use non KNL AVX instructions
-    # with the SIMD_AVX_512_CXX_FLAGS flags.
-    set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}")
-endif()
-
-# Do any special handling needed for .cpp files that use
-# CUDA runtime headers
-if (GMX_GPU_CUDA AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    # CUDA header cuda_runtime_api.h in at least CUDA 10.1 uses 0
-    # where nullptr would be preferable. GROMACS can't fix these, so
-    # must suppress them.
-    GMX_TEST_CXXFLAG(CXXFLAGS_NO_ZERO_AS_NULL_POINTER_CONSTANT "-Wno-zero-as-null-pointer-constant" NVCC_CLANG_SUPPRESSIONS_CXXFLAGS)
-
-    foreach(_compile_flag ${NVCC_CLANG_SUPPRESSIONS_CXXFLAGS})
-        set(GMX_CUDA_CLANG_FLAGS "${GMX_CUDA_CLANG_FLAGS} ${_compile_flag}")
-    endforeach()
-    if (GMX_CLANG_CUDA)
-        foreach (_file ${LIBGROMACS_SOURCES})
-            get_filename_component(_ext ${_file} EXT)
-            get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
-            if ("${_ext}" STREQUAL ".cu" OR _cuda_source_format)
-                gmx_compile_cuda_file_with_clang(${_file})
-            endif()
-        endforeach()
-    else()
-        get_property(CUDA_SOURCES GLOBAL PROPERTY CUDA_SOURCES)
-        set_source_files_properties(${CUDA_SOURCES} PROPERTIES COMPILE_FLAGS ${GMX_CUDA_CLANG_FLAGS})
-    endif()
-endif()
-
-# Only add the -fsycl flag to sources that really need it
-if (GMX_GPU_SYCL)
-    get_property(SYCL_SOURCES GLOBAL PROPERTY SYCL_SOURCES)
-    set_source_files_properties(${SYCL_SOURCES} PROPERTIES COMPILE_FLAGS "${SYCL_CXX_FLAGS}")
-endif()
-
-gmx_setup_tng_for_libgromacs()
-
-# We apply the SYCL flag explicitly just for libgromacs, since bugs in the beta versions of
-# icpx/dpcpp lead to crashes if we try to link a library without any SYCL code with the
-# -fsycl flag enabled. Once those bugs are fixed, we should change it to simply add
-# SYCL_CXX_FLAGS to GMX_SHARED_LINKER_FLAGS.
-target_link_libraries(libgromacs
-                      PRIVATE
-                      ${EXTRAE_LIBRARIES}
-                      ${GMX_EXTRA_LIBRARIES}
-                      ${GMX_COMMON_LIBRARIES}
-                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
-                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}
-                      ${SYCL_CXX_FLAGS}
-                      ${OpenCL_LIBRARIES}
-                      $<$<PLATFORM_ID:SunOS>:socket>
-                      PUBLIC
-                      ${GMX_PUBLIC_LIBRARIES}
-                      ${PLUMED_LOAD}
-                      )
-if (GMX_OPENMP)
-    target_link_libraries(libgromacs PUBLIC OpenMP::OpenMP_CXX)
-endif()
-set_target_properties(libgromacs PROPERTIES
-                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
-                      SOVERSION ${LIBRARY_SOVERSION_MAJOR}
-                      VERSION ${LIBRARY_VERSION}
-                      )
-
-gmx_manage_lmfit()
-target_link_libraries(libgromacs PRIVATE lmfit)
-
-# Make sure we fix "everything" found by more recent versions of clang.
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7")
-   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>)
-endif()
-if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/analyze /analyze:stacksize 70000
-     #Control flow warnings are disabled because the command line output is insufficient. There is no tool
-     #to convert the xml report to e.g. HTML and even in Visual Studio the viewer doesn't work with cmake support.
-     /wd6001  #uninitialized memory
-     /wd6011  #dereferencing NULL
-     /wd6053  #prior call might not zero-terminate
-     /wd6054  #might not be zero-terminated
-     /wd6385  #reading invalid data
-     /wd6386  #buffer overrun
-     /wd6387  #could be '0'
-     /wd28199 #uninitialized memory
-     # For compile-time constants (e.g. templates) the following warnings have false positives
-     /wd6239  #(<non-zero> && <expr>)
-     /wd6240  #(<expr> && <non-zero>)
-     /wd6294  #Ill-defined for-loop
-     /wd6326  #comparison of constant with other constant
-     /wd28020 #expression involving parameter is not true
-     # Misc
-     /wd6330  #incorrect type to function (warns for char (instead of unsigned) for isspace/isalpha/isdigit/..))
-     /wd6993  #OpenMP ignored
-     #TODO
-     /wd6031  #return value ignored (important - mostly warnings about sscanf)
-     /wd6244  #hides declaration (known issue - we ignore similar warnings for other compilers)
-     /wd6246  #hides declaration
-     >
-   )
-endif()
-
-if (GMX_CLANG_TIDY)
-   set_target_properties(libgromacs PROPERTIES CXX_CLANG_TIDY
-       "${CLANG_TIDY_EXE};-warnings-as-errors=*")
-endif()
-
-# clang-3.6 warns about a number of issues that are not reported by more modern compilers
-# and we know they are not real issues. So we only check that it can compile without error
-# but ignore all warnings.
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^3\.6")
-    target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-w>)
-endif()
-
-# Only install the library in mdrun-only mode if it is actually necessary
-# for the binary
-if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
-    install(TARGETS libgromacs
-            EXPORT libgromacs
-            LIBRARY
-                DESTINATION ${CMAKE_INSTALL_LIBDIR}
-                COMPONENT libraries
-            RUNTIME
-                DESTINATION ${CMAKE_INSTALL_BINDIR}
-                COMPONENT libraries
-            ARCHIVE
-                DESTINATION ${CMAKE_INSTALL_LIBDIR}
-                COMPONENT libraries
-            INCLUDES DESTINATION include)
-    target_compile_definitions(libgromacs PUBLIC $<INSTALL_INTERFACE:GMX_DOUBLE=${GMX_DOUBLE_VALUE}>)
-    # legacy headers use c++17 features, so consumer codes need to use that standard, too
-    if(GMX_INSTALL_LEGACY_API)
-        target_compile_features(libgromacs INTERFACE cxx_std_${CMAKE_CXX_STANDARD})
-    endif()
-    add_library(Gromacs::libgromacs ALIAS libgromacs)
-endif()
-
-if (NOT GMX_BUILD_MDRUN_ONLY)
-    include(InstallLibInfo.cmake)
-endif()
-
-# Technically, the user could want to do this for an OpenCL build
-# using the CUDA runtime, but currently there's no reason to want to
-# do that.
-if (INSTALL_CUDART_LIB) # can be set manually by the user
-    if (GMX_GPU_CUDA)
-        foreach(CUDA_LIB ${CUDA_LIBRARIES})
-            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
-            if(IS_CUDART) # only entries whose name contains "cudart" are installed; libcuda should not be installed
-                # also install the name-links (the linker uses those)
-                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
-                install(FILES ${CUDA_LIBS} DESTINATION
-                    ${CMAKE_INSTALL_LIBDIR} COMPONENT libraries)
-            endif()
-        endforeach()
-    else()
-        message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support")
-    endif()
-endif()
-
-if(GMX_GPU_OPENCL)
-    # Install the utility headers
-    file(GLOB OPENCL_INSTALLED_FILES
-        gpu_utils/vectype_ops.clh
-        gpu_utils/device_utils.clh
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/gpu_utils
-        COMPONENT libraries)
-    file(GLOB OPENCL_INSTALLED_FILES
-        pbcutil/ishift.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil
-        COMPONENT libraries)
-
-    # Install the NBNXM source and headers
-    file(GLOB OPENCL_INSTALLED_FILES
-        nbnxm/constants.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm
-        COMPONENT libraries)
-    file(GLOB OPENCL_INSTALLED_FILES
-        nbnxm/opencl/nbnxm_ocl_kernels.cl
-        nbnxm/opencl/nbnxm_ocl_kernel.clh
-        nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
-        nbnxm/opencl/nbnxm_ocl_kernels.clh
-        nbnxm/opencl/nbnxm_ocl_kernels_fastgen.clh
-        nbnxm/opencl/nbnxm_ocl_kernels_fastgen_add_twincut.clh
-        nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
-        nbnxm/opencl/nbnxm_ocl_consts.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm/opencl
-        COMPONENT libraries)
-
-    # Install the PME source and headers
-    file(GLOB OPENCL_INSTALLED_FILES
-        ewald/pme_spread.clh
-        ewald/pme_solve.clh
-        ewald/pme_gather.clh
-        ewald/pme_gpu_calculate_splines.clh
-        ewald/pme_program.cl
-        ewald/pme_gpu_types.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald
-        COMPONENT libraries)
-endif()
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/CMakeLists.txt.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/CMakeLists.txt.preplumed
deleted file mode 100644
index a4430e9dd6..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/CMakeLists.txt.preplumed
+++ /dev/null
@@ -1,476 +0,0 @@
-#
-# This file is part of the GROMACS molecular simulation package.
-#
-# Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
-# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
-# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
-# and including many others, as listed in the AUTHORS file in the
-# top-level source directory and at http://www.gromacs.org.
-#
-# GROMACS is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public License
-# as published by the Free Software Foundation; either version 2.1
-# of the License, or (at your option) any later version.
-#
-# GROMACS is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with GROMACS; if not, see
-# http://www.gnu.org/licenses, or write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
-#
-# If you want to redistribute modifications to GROMACS, please
-# consider that scientific software is very special. Version
-# control is crucial - bugs must be traceable. We will be happy to
-# consider code for inclusion in the official distribution, but
-# derived work must not be called official GROMACS. Details are found
-# in the README & COPYING files - if they are missing, get the
-# official version at http://www.gromacs.org.
-#
-# To help us fund GROMACS development, we humbly ask that you cite
-# the research papers on the package. Check out http://www.gromacs.org.
-
-set(LIBGROMACS_SOURCES)
-
-if (GMX_CLANG_CUDA)
-    include(gmxClangCudaUtils)
-endif()
-
-set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
-set_property(GLOBAL PROPERTY CUDA_SOURCES)
-set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
-set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
-
-set(libgromacs_object_library_dependencies "")
-function (_gmx_add_files_to_property PROPERTY)
-    foreach (_file ${ARGN})
-        if (IS_ABSOLUTE "${_file}")
-            set_property(GLOBAL APPEND PROPERTY ${PROPERTY} ${_file})
-        else()
-            set_property(GLOBAL APPEND PROPERTY ${PROPERTY}
-                         ${CMAKE_CURRENT_LIST_DIR}/${_file})
-        endif()
-    endforeach()
-endfunction ()
-
-function (gmx_add_libgromacs_sources)
-    _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
-endfunction ()
-
-# Permit the configuration to disable compiling the many nbnxm kernels
-# and others involved in force calculations. Currently only
-# short-ranged and bonded kernels are disabled this way, but in future
-# others may be appropriate. Thus the cmake option is not specific to
-# nbnxm module.
-option(GMX_USE_SIMD_KERNELS "Whether to compile NBNXM and other SIMD kernels" ON)
-mark_as_advanced(GMX_USE_SIMD_KERNELS)
-
-# Add these contents first because linking their tests can take a lot
-# of time, so we want lots of parallel work still available after
-# linking starts.
-add_subdirectory(utility)
-# Add normal contents
-add_subdirectory(gmxlib)
-add_subdirectory(mdlib)
-add_subdirectory(applied_forces)
-add_subdirectory(listed_forces)
-add_subdirectory(nbnxm)
-add_subdirectory(commandline)
-add_subdirectory(domdec)
-add_subdirectory(ewald)
-add_subdirectory(fft)
-add_subdirectory(gpu_utils)
-add_subdirectory(hardware)
-add_subdirectory(linearalgebra)
-add_subdirectory(math)
-add_subdirectory(mdrun)
-add_subdirectory(mdrunutility)
-add_subdirectory(mdspan)
-add_subdirectory(mdtypes)
-add_subdirectory(onlinehelp)
-add_subdirectory(options)
-add_subdirectory(pbcutil)
-add_subdirectory(random)
-add_subdirectory(restraint)
-add_subdirectory(tables)
-add_subdirectory(taskassignment)
-add_subdirectory(timing)
-add_subdirectory(topology)
-add_subdirectory(trajectory)
-add_subdirectory(swap)
-add_subdirectory(essentialdynamics)
-add_subdirectory(pulling)
-add_subdirectory(simd)
-add_subdirectory(imd)
-add_subdirectory(compat)
-add_subdirectory(mimic)
-add_subdirectory(modularsimulator)
-if (NOT GMX_BUILD_MDRUN_ONLY)
-    add_subdirectory(gmxana)
-    add_subdirectory(gmxpreprocess)
-    add_subdirectory(correlationfunctions)
-    add_subdirectory(statistics)
-    add_subdirectory(analysisdata)
-    add_subdirectory(coordinateio)
-    add_subdirectory(trajectoryanalysis)
-    add_subdirectory(energyanalysis)
-    add_subdirectory(tools)
-endif()
-
-get_property(PROPERTY_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
-list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES} ${PROPERTY_SOURCES})
-
-# This would be the standard way to include thread_mpi, but
-# we want libgromacs to link the functions directly
-#if(GMX_THREAD_MPI)
-#    add_subdirectory(thread_mpi)
-#endif()
-#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
-tmpi_get_source_list(THREAD_MPI_SOURCES ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/src)
-add_library(thread_mpi OBJECT ${THREAD_MPI_SOURCES})
-target_compile_definitions(thread_mpi PRIVATE HAVE_CONFIG_H)
-if(CYGWIN)
-    # Needs POSIX-isms for strdup, not just std-isms
-    target_compile_definitions(thread_mpi PRIVATE _POSIX_C_SOURCE=200809L)
-endif()
-gmx_target_compile_options(thread_mpi)
-if (WIN32)
-    gmx_target_warning_suppression(thread_mpi /wd4996 HAS_NO_MSVC_UNSAFE_FUNCTION)
-endif()
-list(APPEND libgromacs_object_library_dependencies thread_mpi)
-
-configure_file(version.h.cmakein version.h)
-if(GMX_INSTALL_LEGACY_API)
-  install(FILES
-          ${CMAKE_CURRENT_BINARY_DIR}/version.h
-	  analysisdata.h
-	  options.h
-	  selection.h
-	  trajectoryanalysis.h
-          DESTINATION include/gromacs)
-endif()
-
-# This code is here instead of utility/CMakeLists.txt, because CMake
-# custom commands and source file properties can only be set in the directory
-# that contains the target that uses them.
-# TODO: Generate a header instead that can be included from baseversion.cpp.
-# That probably simplifies things somewhat.
-set(GENERATED_VERSION_FILE utility/baseversion-gen.cpp)
-gmx_configure_version_file(
-    utility/baseversion-gen.cpp.cmakein ${GENERATED_VERSION_FILE}
-    REMOTE_HASH
-    EXTRA_VARS
-        GMX_SOURCE_DOI
-        GMX_RELEASE_HASH
-        GMX_SOURCE_HASH
-        )
-list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
-
-# Mark some shared GPU implementation files to compile with CUDA if needed
-if (GMX_GPU_CUDA)
-    get_property(CUDA_SOURCES GLOBAL PROPERTY CUDA_SOURCES)
-    set_source_files_properties(${CUDA_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-endif()
-
-if (GMX_GPU_CUDA)
-    # Work around FindCUDA that prevents using target_link_libraries()
-    # with keywords otherwise...
-    set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
-    if (NOT GMX_CLANG_CUDA)
-        gmx_cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
-    else()
-        add_library(libgromacs ${LIBGROMACS_SOURCES})
-    endif()
-    target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES})
-else()
-    add_library(libgromacs ${LIBGROMACS_SOURCES})
-endif()
-
-# Add these contents first because linking their tests can take a lot
-# of time, so we want lots of parallel work still available after
-# linking starts.
-add_subdirectory(fileio)
-add_subdirectory(selection)
-
-# Suppress a warning about our abuse of t_inputrec
-gmx_source_file_warning_suppression(mdtypes/inputrec.cpp -Wno-class-memaccess HAS_NO_CLASS_MEMACCESS)
-
-# Handle the object libraries that contain the source file
-# dependencies that need special handling because they are generated
-# or external code.
-foreach(object_library ${libgromacs_object_library_dependencies})
-    if (BUILD_SHARED_LIBS)
-        set_target_properties(${object_library} PROPERTIES POSITION_INDEPENDENT_CODE true)
-    endif()
-    target_include_directories(${object_library} SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
-
-    # Add the sources from the object libraries to the main library.
-    target_sources(libgromacs PRIVATE $<TARGET_OBJECTS:${object_library}>)
-endforeach()
-gmx_target_compile_options(libgromacs)
-target_compile_definitions(libgromacs PRIVATE HAVE_CONFIG_H)
-target_include_directories(libgromacs SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)
-
-if (GMX_GPU_OPENCL)
-    option(GMX_EXTERNAL_CLFFT "True if an external clFFT is required to be used" FALSE)
-    mark_as_advanced(GMX_EXTERNAL_CLFFT)
-
-    # Default to using a clFFT found on the system; once clFFT_LIBRARY is
-    # already defined (i.e. on a later configure run) repeat the search quietly.
-    if (DEFINED clFFT_LIBRARY)
-        set (clFFT_FIND_QUIETLY TRUE)
-    endif()
-    find_package(clFFT)
-    if (NOT clFFT_FOUND)
-        if (GMX_EXTERNAL_CLFFT)
-            message(FATAL_ERROR "Did not find required external clFFT library, consider setting clFFT_ROOT_DIR")
-        endif()
-
-        if(MSVC)
-            message(FATAL_ERROR
-"An OpenCL build was requested with Visual Studio compiler, but GROMACS
-requires clFFT, which was not found on your system. GROMACS does bundle
-clFFT to help with building for OpenCL, but that clFFT has not yet been
-ported to the more recent versions of that compiler that GROMACS itself
-requires. Thus for now, OpenCL is not available with MSVC and the internal
-build of clFFT in GROMACS 2019. Either change compiler, try installing
-a clFFT package, or use the latest GROMACS 2018 point release.")
-        endif()
-
-        # Fall back on the internal version bundled under ../external/clFFT
-        set (_clFFT_dir ../external/clFFT/src)
-        add_subdirectory(${_clFFT_dir} clFFT-build)
-        target_sources(libgromacs PRIVATE
-            $<TARGET_OBJECTS:clFFT>
-        )
-        target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include)
-        # Use the magic variable for how to link any library needed for
-        # dlopen, etc.  which is -ldl where needed, and empty otherwise
-        # (e.g. Windows, BSD, Mac).
-        target_link_libraries(libgromacs PRIVATE "${CMAKE_DL_LIBS}")
-    else()
-        target_link_libraries(libgromacs PRIVATE clFFT)
-    endif()
-endif()
-
-# Permit GROMACS code to include externally developed headers, such as
-# the functionality from the nonstd project that we use for
-# gmx::compat::optional. These are included as system headers so that
-# no warnings are issued from them.
-#
-# TODO Perhaps generalize this for all headers from src/external
-target_include_directories(libgromacs SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/src/external)
-
-if(SIMD_AVX_512_CXX_SUPPORTED AND NOT ("${GMX_SIMD_ACTIVE}" STREQUAL "AVX_512_KNL"))
-    # Since we might be overriding -march=core-avx2, add a flag so we don't warn for this specific file.
-    # On KNL this can cause illegal instruction because the compiler might use non KNL AVX instructions
-    # with the SIMD_AVX_512_CXX_FLAGS flags.
-    set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS} ${CXX_NO_UNUSED_OPTION_WARNING_FLAGS}")
-endif()
-
-# Do any special handling needed for .cpp files that use
-# CUDA runtime headers
-if (GMX_GPU_CUDA AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    # CUDA header cuda_runtime_api.h in at least CUDA 10.1 uses 0
-    # where nullptr would be preferable. GROMACS can't fix these, so
-    # must suppress them.
-    GMX_TEST_CXXFLAG(CXXFLAGS_NO_ZERO_AS_NULL_POINTER_CONSTANT "-Wno-zero-as-null-pointer-constant" NVCC_CLANG_SUPPRESSIONS_CXXFLAGS)
-
-    foreach(_compile_flag ${NVCC_CLANG_SUPPRESSIONS_CXXFLAGS})
-        set(GMX_CUDA_CLANG_FLAGS "${GMX_CUDA_CLANG_FLAGS} ${_compile_flag}")
-    endforeach()
-    if (GMX_CLANG_CUDA)
-        foreach (_file ${LIBGROMACS_SOURCES})
-            get_filename_component(_ext ${_file} EXT)
-            get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
-            if ("${_ext}" STREQUAL ".cu" OR _cuda_source_format)
-                gmx_compile_cuda_file_with_clang(${_file})
-            endif()
-        endforeach()
-    else()
-        get_property(CUDA_SOURCES GLOBAL PROPERTY CUDA_SOURCES)
-        set_source_files_properties(${CUDA_SOURCES} PROPERTIES COMPILE_FLAGS ${GMX_CUDA_CLANG_FLAGS})
-    endif()
-endif()
-
-# Only add the -fsycl flag to sources that really need it
-if (GMX_GPU_SYCL)
-    get_property(SYCL_SOURCES GLOBAL PROPERTY SYCL_SOURCES)
-    set_source_files_properties(${SYCL_SOURCES} PROPERTIES COMPILE_FLAGS "${SYCL_CXX_FLAGS}")
-endif()
-
-gmx_setup_tng_for_libgromacs()
-
-# We apply the SYCL flag explicitly just for libgromacs, since bugs in the beta versions of
-# icpx/dpcpp lead to crashes if we try to link a library without any SYCL code with the
-# -fsycl flag enabled. Once that bug is fixed, we should change it to simply add
-# SYCL_CXX_FLAGS to GMX_SHARED_LINKER_FLAGS.
-target_link_libraries(libgromacs
-                      PRIVATE
-                      ${EXTRAE_LIBRARIES}
-                      ${GMX_EXTRA_LIBRARIES}
-                      ${GMX_COMMON_LIBRARIES}
-                      ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
-                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}
-                      ${SYCL_CXX_FLAGS}
-                      ${OpenCL_LIBRARIES}
-                      $<$<PLATFORM_ID:SunOS>:socket>
-                      PUBLIC
-                      ${GMX_PUBLIC_LIBRARIES}
-                      )
-if (GMX_OPENMP)
-    target_link_libraries(libgromacs PUBLIC OpenMP::OpenMP_CXX)
-endif()
-set_target_properties(libgromacs PROPERTIES
-                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
-                      SOVERSION ${LIBRARY_SOVERSION_MAJOR}
-                      VERSION ${LIBRARY_VERSION}
-                      )
-
-gmx_manage_lmfit()
-target_link_libraries(libgromacs PRIVATE lmfit)
-
-# Make sure we fix "everything" found by more recent versions of clang.
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7")
-   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Weverything ${IGNORED_CLANG_ALL_WARNINGS}>)
-endif()
-if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-   target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/analyze /analyze:stacksize 70000
-     #Control flow warnings are disabled because the command line output is insufficient. There is no tool
-     #to convert the xml report to e.g. HTML and even in Visual Studio the viewer doesn't work with cmake support.
-     /wd6001  #uninitialized memory
-     /wd6011  #dereferencing NULL
-     /wd6053  #prior call may not zero-terminate
-     /wd6054  #might not be zero-terminated
-     /wd6385  #reading invalid data
-     /wd6386  #buffer overrun
-     /wd6387  #could be '0'
-     /wd28199 #uninitialized memory
-     # For compile time constants (e.g. templates) the following warnings have false positives
-     /wd6239  #(<non-zero> && <expr>)
-     /wd6240  #(<expr> && <non-zero>)
-     /wd6294  #Ill-defined for-loop
-     /wd6326  #comparison of constant with other constant
-     /wd28020 #expression involving parameter is not true
-     # Misc
-     /wd6330  #incorrect type to function (warns for char (instead of unsigned) for isspace/isalpha/isdigit/..))
-     /wd6993  #OpenMP ignored
-     #TODO
-     /wd6031  #return value ignored (important - mostly warnings about sscanf)
-     /wd6244  #hides declaration (known issue - we ignore similar warnings for other compilers)
-     /wd6246  #hides declaration
-     >
-   )
-endif()
-
-if (GMX_CLANG_TIDY)
-   set_target_properties(libgromacs PROPERTIES CXX_CLANG_TIDY
-       "${CLANG_TIDY_EXE};-warnings-as-errors=*")
-endif()
-
-# clang-3.6 warns about a number of issues that are not reported by more modern compilers
-# and we know they are not real issues. So we only check that it can compile without error
-# but ignore all warnings.
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION MATCHES "^3\.6")
-    target_compile_options(libgromacs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-w>)
-endif()
-
-# Only install the library in mdrun-only mode if it is actually necessary
-# for the binary
-if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS)
-    install(TARGETS libgromacs
-            EXPORT libgromacs
-            LIBRARY
-                DESTINATION ${CMAKE_INSTALL_LIBDIR}
-                COMPONENT libraries
-            RUNTIME
-                DESTINATION ${CMAKE_INSTALL_BINDIR}
-                COMPONENT libraries
-            ARCHIVE
-                DESTINATION ${CMAKE_INSTALL_LIBDIR}
-                COMPONENT libraries
-            INCLUDES DESTINATION include)
-    target_compile_definitions(libgromacs PUBLIC $<INSTALL_INTERFACE:GMX_DOUBLE=${GMX_DOUBLE_VALUE}>)
-    # legacy headers use c++17 features, so consumer codes need to use that standard, too
-    if(GMX_INSTALL_LEGACY_API)
-        target_compile_features(libgromacs INTERFACE cxx_std_${CMAKE_CXX_STANDARD})
-    endif()
-    add_library(Gromacs::libgromacs ALIAS libgromacs)
-endif()
-
-if (NOT GMX_BUILD_MDRUN_ONLY)
-    include(InstallLibInfo.cmake)
-endif()
-
-# Technically, the user could want to do this for an OpenCL build
-# using the CUDA runtime, but currently there's no reason to want to
-# do that.
-if (INSTALL_CUDART_LIB) # can be set manually by the user
-    if (GMX_GPU_CUDA)
-        foreach(CUDA_LIB ${CUDA_LIBRARIES})
-            string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
-            if(IS_CUDART) # only entries matching "cudart" are installed; libcuda should not be installed
-                # also install the name-links (the linker uses those), hence the trailing-* glob
-                file(GLOB CUDA_LIBS ${CUDA_LIB}*)
-                install(FILES ${CUDA_LIBS} DESTINATION
-                    ${CMAKE_INSTALL_LIBDIR} COMPONENT libraries)
-            endif()
-        endforeach()
-    else()
-        message(WARNING "INSTALL_CUDART_LIB only makes sense when configuring for CUDA support")
-    endif()
-endif()
-
-if(GMX_GPU_OPENCL)
-    # Install the utility headers
-    file(GLOB OPENCL_INSTALLED_FILES
-        gpu_utils/vectype_ops.clh
-        gpu_utils/device_utils.clh
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/gpu_utils
-        COMPONENT libraries)
-    file(GLOB OPENCL_INSTALLED_FILES
-        pbcutil/ishift.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/pbcutil
-        COMPONENT libraries)
-
-    # Install the NBNXM source and headers
-    file(GLOB OPENCL_INSTALLED_FILES
-        nbnxm/constants.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm
-        COMPONENT libraries)
-    file(GLOB OPENCL_INSTALLED_FILES
-        nbnxm/opencl/nbnxm_ocl_kernels.cl
-        nbnxm/opencl/nbnxm_ocl_kernel.clh
-        nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
-        nbnxm/opencl/nbnxm_ocl_kernels.clh
-        nbnxm/opencl/nbnxm_ocl_kernels_fastgen.clh
-        nbnxm/opencl/nbnxm_ocl_kernels_fastgen_add_twincut.clh
-        nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
-        nbnxm/opencl/nbnxm_ocl_consts.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/nbnxm/opencl
-        COMPONENT libraries)
-
-    # Install the PME source and headers
-    file(GLOB OPENCL_INSTALLED_FILES
-        ewald/pme_spread.clh
-        ewald/pme_solve.clh
-        ewald/pme_gather.clh
-        ewald/pme_gpu_calculate_splines.clh
-        ewald/pme_program.cl
-        ewald/pme_gpu_types.h
-        )
-    install(FILES ${OPENCL_INSTALLED_FILES}
-        DESTINATION ${GMX_INSTALL_OCLDIR}/gromacs/ewald
-        COMPONENT libraries)
-endif()
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.cpp
deleted file mode 100644
index 14924afbc1..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.cpp
+++ /dev/null
@@ -1,1646 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/* PLUMED */
-#include "../../../Plumed.h"
-extern int    plumedswitch;
-extern plumed plumedmain;
-/* END PLUMED */
-
-#include "gmxpre.h"
-
-#include "expanded.h"
-
-#include <cmath>
-#include <cstdio>
-
-#include <algorithm>
-
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/confio.h"
-#include "gromacs/fileio/gmxfio.h"
-#include "gromacs/fileio/xtcio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/orires.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/calcmu.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/random/threefry.h"
-#include "gromacs/random/uniformrealdistribution.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/smalloc.h"
-
-#include "expanded_internal.h"
-
-static void init_df_history_weights(df_history_t* dfhist, const t_expanded* expand, int nlim)
-{ // Seeds the weight history in dfhist from the initial values stored in expand, for the first nlim states.
-    int i;
-    dfhist->wl_delta = expand->init_wl_delta; // start from the configured initial wl_delta
-    for (i = 0; i < nlim; i++)
-    {
-        dfhist->sum_weights[i] = expand->init_lambda_weights[i];
-        dfhist->sum_dg[i]      = expand->init_lambda_weights[i]; // sum_dg starts from the same initial weights
-    }
-}
-
-/* Initializes expanded ensemble before the md loop starts (eventually should contain all the functions
-   needed for that): seeds dfhist unless bStateFromCP is set and, if PLUMED is on, logs which weights apply */
-void init_expanded_ensemble(gmx_bool bStateFromCP, const t_inputrec* ir, df_history_t* dfhist, const gmx::MDLogger& mdlog)
-{
-    if (!bStateFromCP)
-    {
-        init_df_history_weights(dfhist, ir->expandedvals, ir->fepvals->n_lambda);
-    }
-    if (plumedswitch)
-    {
-        if (ir->expandedvals->elamstats == elamstatsNO)
-        {
-            // No weight updating was chosen, use PLUMED weights (asserts API version >= 9; the message names v2.8)
-            int plumedVersion=0; // filled in by the "getApiVersion" query below
-            plumed_cmd(plumedmain, "getApiVersion", &plumedVersion);
-            GMX_RELEASE_ASSERT(
-                    plumedVersion >= 9,
-                    "Please use PLUMED v2.8 or newer to use alchemical metadynamics with expanded ensemble");
-
-            GMX_LOG(mdlog.info).asParagraph().appendText(
-                    "You requested an expanded ensemble simulation with lmc-stats = no and activated PLUMED.\n"
-                    "As a result, this simulation will use the bias provided by PLUMED and ignore all\n"
-                    "expanded ensemble settings related to weight updates.\n"
-                    "If you want to use lambda weights updated by GROMACS in the expanded ensemble calculation,\n"
-                    "set lmc-stats != no.");
-        }
-        else
-        {
-            GMX_LOG(mdlog.info).asParagraph().appendText(
-                    "You requested an expanded ensemble simulation with lmc-stats != no and activated PLUMED.\n"
-                    "As a result, this simulation will use lambda weights managed by GROMACS and will not\n"
-                    "explicitly use the PLUMED bias in the expanded ensemble calculation.\n"
-                    "If you want to use the PLUMED bias as lambda weights, set lmc-stats = no.");
-        }
-    }
-}
-
-static void GenerateGibbsProbabilities(const real* ene, double* p_k, double* pks, int minfep, int maxfep)
-{
-    /* Fills p_k[minfep..maxfep] with exp(ene[i] - max) / sum; the sum of the shifted exponentials is left in *pks. */
-    int  i;
-    real maxene;
-
-    *pks   = 0.0;
-    maxene = ene[minfep];
-    /* find the maximum value over the inclusive range; it is subtracted below so no exponent is positive */
-    for (i = minfep; i <= maxfep; i++)
-    {
-        if (ene[i] > maxene)
-        {
-            maxene = ene[i];
-        }
-    }
-    /* find the denominator: the sum of the shifted exponentials */
-    for (i = minfep; i <= maxfep; i++)
-    {
-        *pks += std::exp(ene[i] - maxene);
-    }
-    /* numerators, divided by the denominator so the entries in the range sum to one */
-    for (i = minfep; i <= maxfep; i++)
-    {
-        p_k[i] = std::exp(ene[i] - maxene) / *pks;
-    }
-}
-
-static void
-GenerateWeightedGibbsProbabilities(const real* ene, double* p_k, double* pks, int nlim, real* nvals, real delta)
-{
-    /* Like GenerateGibbsProbabilities over [0, nlim), but each ene[i] is first offset by log(nvals[i]) (log(delta) if nvals[i] is 0). */
-    int   i;
-    real  maxene;
-    real* nene;
-    *pks = 0.0;
-
-    snew(nene, nlim); /* scratch copy of the offset energies; released with sfree() at the end */
-    for (i = 0; i < nlim; i++)
-    {
-        if (nvals[i] == 0)
-        {
-            /* nvals[i] is zero here, so the log argument is just delta: we add it since we need to make
-               sure the argument is greater than zero, and we need a non-arbitrary number? */
-            nene[i] = ene[i] + std::log(nvals[i] + delta);
-        }
-        else
-        {
-            nene[i] = ene[i] + std::log(nvals[i]);
-        }
-    }
-
-    /* find the maximum value */
-    maxene = nene[0];
-    for (i = 0; i < nlim; i++)
-    {
-        if (nene[i] > maxene)
-        {
-            maxene = nene[i];
-        }
-    }
-
-    /* subtract off the maximum, avoiding overflow */
-    for (i = 0; i < nlim; i++)
-    {
-        nene[i] -= maxene;
-    }
-
-    /* find the denominator: the sum of the shifted exponentials, accumulated in *pks */
-    for (i = 0; i < nlim; i++)
-    {
-        *pks += std::exp(nene[i]);
-    }
-
-    /* numerators, divided by the denominator */
-    for (i = 0; i < nlim; i++)
-    {
-        p_k[i] = std::exp(nene[i]) / *pks;
-    }
-    sfree(nene);
-}
-
-static int FindMinimum(const real* min_metric, int N)
-{
-    /* Returns the index of the smallest of the first N entries; ties keep the lowest index. Reads min_metric[0] unconditionally. */
-    real min_val;
-    int  min_nval, nval;
-
-    min_nval = 0;
-    min_val  = min_metric[0];
-
-    for (nval = 0; nval < N; nval++)
-    {
-        if (min_metric[nval] < min_val) /* strict '<', so an equal later entry does not replace the current minimum */
-        {
-            min_val  = min_metric[nval];
-            min_nval = nval;
-        }
-    }
-    return min_nval;
-}
-
-static gmx_bool CheckHistogramRatios(int nhisto, const real* histo, real ratio)
-{
-    /* Returns TRUE if every histo[i] / mean(histo) lies strictly between ratio and 1/ratio; FALSE if histo sums to zero. */
-    int      i;
-    real     nmean;
-    gmx_bool bIfFlat;
-
-    nmean = 0;
-    for (i = 0; i < nhisto; i++)
-    {
-        nmean += histo[i];
-    }
-
-    if (nmean == 0)
-    {
-        /* no samples at all: report not flat (this also avoids dividing by a zero mean below) */
-        bIfFlat = FALSE;
-        return bIfFlat;
-    }
-    nmean /= static_cast<real>(nhisto); /* nmean held the sum until here; now it is the mean */
-
-    bIfFlat = TRUE;
-    for (i = 0; i < nhisto; i++)
-    {
-        /* make sure that all points are in the ratio < x <  1/ratio range  */
-        if (!((histo[i] / nmean < 1.0 / ratio) && (histo[i] / nmean > ratio)))
-        {
-            bIfFlat = FALSE;
-            break;
-        }
-    }
-    return bIfFlat;
-}
-
-static gmx_bool CheckIfDoneEquilibrating(int nlim, const t_expanded* expand, const df_history_t* dfhist, int64_t step)
-{
-    /* Returns TRUE once the equilibration criterion configured in expand is satisfied for the nlim states, FALSE otherwise. */
-    int      i, totalsamples;
-    gmx_bool bDoneEquilibrating = TRUE;
-    gmx_bool bIfFlat;
-
-    /* Forced initial sampling (lmc_forced_nstart > 0): only the per-state counts are checked; the elmceq switch is skipped */
-    if (expand->lmc_forced_nstart > 0)
-    {
-        for (i = 0; i < nlim; i++)
-        {
-            if (dfhist->n_at_lam[i]
-                < expand->lmc_forced_nstart) /* we are still doing the initial sweep, so we're
-                                                definitely not done equilibrating*/
-            {
-                bDoneEquilibrating = FALSE;
-                break;
-            }
-        }
-    }
-    else
-    {
-        /* assume we have equilibrated the weights, then check to see if any of the conditions are not met */
-        bDoneEquilibrating = TRUE;
-
-        /* dispatch on the selected equilibration criterion */
-        switch (expand->elmceq)
-        {
-            case elmceqNO:
-                /* We have not equilibrated, and won't, ever. */
-                bDoneEquilibrating = FALSE;
-                break;
-            case elmceqYES:
-                /* we have equilibrated -- we're done */
-                bDoneEquilibrating = TRUE;
-                break;
-            case elmceqSTEPS:
-                /* first, check if we are equilibrating by steps, if we're still under */
-                if (step < expand->equil_steps)
-                {
-                    bDoneEquilibrating = FALSE;
-                }
-                break;
-            case elmceqSAMPLES:
-                totalsamples = 0; /* total number of samples summed over all states */
-                for (i = 0; i < nlim; i++)
-                {
-                    totalsamples += dfhist->n_at_lam[i];
-                }
-                if (totalsamples < expand->equil_samples)
-                {
-                    bDoneEquilibrating = FALSE;
-                }
-                break;
-            case elmceqNUMATLAM:
-                for (i = 0; i < nlim; i++)
-                {
-                    if (dfhist->n_at_lam[i]
-                        < expand->equil_n_at_lam) /* this state has too few samples, so we're
-                                                     definitely not done equilibrating*/
-                    {
-                        bDoneEquilibrating = FALSE;
-                        break;
-                    }
-                }
-                break;
-            case elmceqWLDELTA:
-                if (EWL(expand->elamstats)) /* This check is in readir as well, but
-                                               just to be sure */
-                {
-                    if (dfhist->wl_delta > expand->equil_wl_delta)
-                    {
-                        bDoneEquilibrating = FALSE;
-                    }
-                }
-                break;
-            case elmceqRATIO:
-                /* we can use the flatness as a judge of good weights, as long as
-                   we're not doing minvar, or Wang-Landau.
-                   But turn off for now until we figure out exactly how we do this.
-                 */
-
-                if (!(EWL(expand->elamstats) || expand->elamstats == elamstatsMINVAR))
-                {
-                    /* we want to use flatness -avoiding- the forced-through samples.  Plus, we need
-                       to convert to floats for this histogram function. */
-
-                    real* modhisto;
-                    snew(modhisto, nlim);
-                    for (i = 0; i < nlim; i++)
-                    {
-                        modhisto[i] = 1.0 * (dfhist->n_at_lam[i] - expand->lmc_forced_nstart);
-                    }
-                    bIfFlat = CheckHistogramRatios(nlim, modhisto, expand->equil_ratio);
-                    sfree(modhisto);
-                    if (!bIfFlat)
-                    {
-                        bDoneEquilibrating = FALSE;
-                    }
-                }
-                break;
-            default: bDoneEquilibrating = TRUE; break; /* any other criterion value is treated as equilibrated */
-        }
-    }
-    return bDoneEquilibrating;
-}
-
-static gmx_bool UpdateWeights(int           nlim,
-                              t_expanded*   expand,
-                              df_history_t* dfhist,
-                              int           fep_state,
-                              const real*   scaled_lamee,
-                              const real*   weighted_lamee,
-                              int64_t       step)
-{
-    gmx_bool bSufficientSamples;
-    real     acceptanceWeight;
-    int      i;
-    int      min_nvalm, min_nvalp, maxc;
-    real     omega_m1_0, omega_p1_0;
-    real     zero_sum_weights;
-    real *omegam_array, *weightsm_array, *omegap_array, *weightsp_array, *varm_array, *varp_array,
-            *dwp_array, *dwm_array;
-    real    clam_varm, clam_varp, clam_osum, clam_weightsm, clam_weightsp, clam_minvar;
-    real *  lam_variance, *lam_dg;
-    double* p_k;
-    double  pks = 0;
-
-    /* Future potential todos for this function (see #3848):
-     *  - Update the names in the dhist structure to be clearer. Not done for now since this
-     *    a bugfix update and we are mininizing other code changes.
-     *  - Modularize the code some more.
-     *  - potentially merge with accelerated weight histogram functionality, since it's very similar.
-     */
-    /*  if we have equilibrated the expanded ensemble weights, we are not updating them, so exit now */
-    if (dfhist->bEquil)
-    {
-        return FALSE;
-    }
-
-    if (CheckIfDoneEquilibrating(nlim, expand, dfhist, step))
-    {
-        dfhist->bEquil = TRUE;
-        /* zero out the visited states so we know how many equilibrated states we have
-           from here on out.*/
-        for (i = 0; i < nlim; i++)
-        {
-            dfhist->n_at_lam[i] = 0;
-        }
-        return TRUE;
-    }
-
-    /* If we reached this far, we have not equilibrated yet, keep on
-       going resetting the weights */
-
-    if (EWL(expand->elamstats))
-    {
-        if (expand->elamstats == elamstatsWL) /* Using standard Wang-Landau for weight updates */
-        {
-            dfhist->sum_weights[fep_state] -= dfhist->wl_delta;
-            dfhist->wl_histo[fep_state] += 1.0;
-        }
-        else if (expand->elamstats == elamstatsWWL)
-        /* Using weighted Wang-Landau for weight updates.
-         * Very closly equivalent to accelerated weight histogram approach
-         * applied to expanded ensemble. */
-        {
-            snew(p_k, nlim);
-
-            /* first increment count */
-            GenerateGibbsProbabilities(weighted_lamee, p_k, &pks, 0, nlim - 1);
-            for (i = 0; i < nlim; i++)
-            {
-                dfhist->wl_histo[i] += static_cast<real>(p_k[i]);
-            }
-
-            /* then increment weights (uses count) */
-            pks = 0.0;
-            GenerateWeightedGibbsProbabilities(weighted_lamee, p_k, &pks, nlim, dfhist->wl_histo,
-                                               dfhist->wl_delta);
-
-            for (i = 0; i < nlim; i++)
-            {
-                dfhist->sum_weights[i] -= dfhist->wl_delta * static_cast<real>(p_k[i]);
-            }
-            /* Alternate definition, using logarithms. Shouldn't make very much difference! */
-            /*
-               real di;
-               for (i=0;i<nlim;i++)
-               {
-                di = (real)1.0 + dfhist->wl_delta*(real)p_k[i];
-                dfhist->sum_weights[i] -= log(di);
-               }
-             */
-            sfree(p_k);
-        }
-
-        zero_sum_weights = dfhist->sum_weights[0];
-        for (i = 0; i < nlim; i++)
-        {
-            dfhist->sum_weights[i] -= zero_sum_weights;
-        }
-    }
-
-    if (expand->elamstats == elamstatsBARKER || expand->elamstats == elamstatsMETROPOLIS
-        || expand->elamstats == elamstatsMINVAR)
-    {
-        maxc = 2 * expand->c_range + 1;
-
-        snew(lam_dg, nlim);
-        snew(lam_variance, nlim);
-
-        snew(omegap_array, maxc);
-        snew(weightsp_array, maxc);
-        snew(varp_array, maxc);
-        snew(dwp_array, maxc);
-
-        snew(omegam_array, maxc);
-        snew(weightsm_array, maxc);
-        snew(varm_array, maxc);
-        snew(dwm_array, maxc);
-
-        /* unpack the values of the free energy differences and the
-         * variance in their estimates between nearby lambdas. We will
-         * only actually update 2 of these, the state we are currently
-         * at and the one we end up moving to
-         */
-
-        for (i = 0; i < nlim - 1; i++)
-        { /* only through the second to last */
-            lam_dg[i] = dfhist->sum_dg[i + 1] - dfhist->sum_dg[i];
-            lam_variance[i] =
-                    gmx::square(dfhist->sum_variance[i + 1]) - gmx::square(dfhist->sum_variance[i]);
-        }
-
-        /* accumulate running averages of thermodynamic averages for Bennett Acceptance Ratio-based
-         * estimates of the free energy .
-         * Rather than peforming self-consistent estimation of the free energies at each step,
-         * we keep track of an array of possible different free energies (cnvals),
-         * and we self-consistently choose the best one. The one that leads to a free energy estimate
-         * that is closest to itself is the best estimate of the free energy.  It is essentially a
-         * parallellized version of self-consistent iteration.  maxc is the number of these constants. */
-
-        for (int nval = 0; nval < maxc; nval++)
-        {
-            const real cnval = static_cast<real>(nval - expand->c_range);
-
-            /* Compute acceptance criterion weight to the state below this one for use in averages.
-             * Note we do not have to have just moved from that state to use this free energy
-             * estimate; these are essentially "virtual" moves. */
-
-            if (fep_state > 0)
-            {
-                const auto lambdaEnergyDifference =
-                        cnval - (scaled_lamee[fep_state] - scaled_lamee[fep_state - 1]);
-                acceptanceWeight =
-                        gmx::calculateAcceptanceWeight(expand->elamstats, lambdaEnergyDifference);
-                dfhist->accum_m[fep_state][nval] += acceptanceWeight;
-                dfhist->accum_m2[fep_state][nval] += acceptanceWeight * acceptanceWeight;
-            }
-
-            // Compute acceptance criterion weight to transition to the next state
-            if (fep_state < nlim - 1)
-            {
-                const auto lambdaEnergyDifference =
-                        -cnval + (scaled_lamee[fep_state + 1] - scaled_lamee[fep_state]);
-                acceptanceWeight =
-                        gmx::calculateAcceptanceWeight(expand->elamstats, lambdaEnergyDifference);
-                dfhist->accum_p[fep_state][nval] += acceptanceWeight;
-                dfhist->accum_p2[fep_state][nval] += acceptanceWeight * acceptanceWeight;
-            }
-
-            /* Determination of Metropolis transition and Barker transition weights */
-
-            int numObservationsCurrentState = dfhist->n_at_lam[fep_state];
-            /* determine the number of observations above and below the current state */
-            int numObservationsLowerState = 0;
-            if (fep_state > 0)
-            {
-                numObservationsLowerState = dfhist->n_at_lam[fep_state - 1];
-            }
-            int numObservationsHigherState = 0;
-            if (fep_state < nlim - 1)
-            {
-                numObservationsHigherState = dfhist->n_at_lam[fep_state + 1];
-            }
-
-            /* Calculate the biases for each expanded ensemble state that minimize the total
-             * variance, as implemented in Martinez-Veracoechea and Escobedo,
-             * J. Phys. Chem. B 2008, 112, 8120-8128
-             *
-             * The variance associated with the free energy estimate between two states i and j
-             * is calculated as
-             *     Var(i,j) = {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} / numObservations(i->j)
-             *              + {avg[xi(j->i)^2] / avg[xi(j->i)]^2 - 1} / numObservations(j->i)
-             * where xi(i->j) is the acceptance factor / weight associated with moving from state i to j
-             * As we are calculating the acceptance factor to the neighbors every time we're visiting
-             * a state, numObservations(i->j) == numObservations(i) and numObservations(j->i) == numObservations(j)
-             */
-
-            /* Accumulation of acceptance weight averages between the current state and the
-             * states +1 (p1) and -1 (m1), averaged at current state (0)
-             */
-            real avgAcceptanceCurrentToLower  = 0;
-            real avgAcceptanceCurrentToHigher = 0;
-            /* Accumulation of acceptance weight averages quantities between states 0
-             *  and states +1 and -1, squared
-             */
-            real avgAcceptanceCurrentToLowerSquared  = 0;
-            real avgAcceptanceCurrentToHigherSquared = 0;
-            /* Accumulation of free energy quantities from lower state (m1) to current state (0) and squared */
-            real avgAcceptanceLowerToCurrent        = 0;
-            real avgAcceptanceLowerToCurrentSquared = 0;
-            /* Accumulation of free energy quantities from upper state (p1) to current state (0) and squared */
-            real avgAcceptanceHigherToCurrent        = 0;
-            real avgAcceptanceHigherToCurrentSquared = 0;
-
-            if (numObservationsCurrentState > 0)
-            {
-                avgAcceptanceCurrentToLower = dfhist->accum_m[fep_state][nval] / numObservationsCurrentState;
-                avgAcceptanceCurrentToHigher =
-                        dfhist->accum_p[fep_state][nval] / numObservationsCurrentState;
-                avgAcceptanceCurrentToLowerSquared =
-                        dfhist->accum_m2[fep_state][nval] / numObservationsCurrentState;
-                avgAcceptanceCurrentToHigherSquared =
-                        dfhist->accum_p2[fep_state][nval] / numObservationsCurrentState;
-            }
-
-            if ((fep_state > 0) && (numObservationsLowerState > 0))
-            {
-                avgAcceptanceLowerToCurrent =
-                        dfhist->accum_p[fep_state - 1][nval] / numObservationsLowerState;
-                avgAcceptanceLowerToCurrentSquared =
-                        dfhist->accum_p2[fep_state - 1][nval] / numObservationsLowerState;
-            }
-
-            if ((fep_state < nlim - 1) && (numObservationsHigherState > 0))
-            {
-                avgAcceptanceHigherToCurrent =
-                        dfhist->accum_m[fep_state + 1][nval] / numObservationsHigherState;
-                avgAcceptanceHigherToCurrentSquared =
-                        dfhist->accum_m2[fep_state + 1][nval] / numObservationsHigherState;
-            }
-            /* These are accumulation of positive values (see definition of acceptance functions
-             * above), or of squares of positive values.
-             * We're taking this for granted in the following calculation, so make sure
-             * here that nothing weird happened. Although technically all values should be positive,
-             * because of floating point precisions, they might be numerically zero. */
-            GMX_RELEASE_ASSERT(
-                    avgAcceptanceCurrentToLower >= 0 && avgAcceptanceCurrentToLowerSquared >= 0
-                            && avgAcceptanceCurrentToHigher >= 0
-                            && avgAcceptanceCurrentToHigherSquared >= 0 && avgAcceptanceLowerToCurrent >= 0
-                            && avgAcceptanceLowerToCurrentSquared >= 0 && avgAcceptanceHigherToCurrent >= 0
-                            && avgAcceptanceHigherToCurrentSquared >= 0,
-                    "By definition, the acceptance factors should all be nonnegative.");
-
-            real varianceCurrentToLower   = 0;
-            real varianceCurrentToHigher  = 0;
-            real weightDifferenceToLower  = 0;
-            real weightDifferenceToHigher = 0;
-            real varianceToLower          = 0;
-            real varianceToHigher         = 0;
-
-            if (fep_state > 0)
-            {
-                if (numObservationsCurrentState > 0)
-                {
-                    /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                     *
-                     * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                     * acceptances are all positive!), and hence
-                     *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                     * We're catching that case explicitly to avoid numerical
-                     * problems dividing by zero when the overlap between states is small (#3304)
-                     */
-                    if (avgAcceptanceCurrentToLower > 0)
-                    {
-                        varianceCurrentToLower =
-                                avgAcceptanceCurrentToLowerSquared
-                                        / (avgAcceptanceCurrentToLower * avgAcceptanceCurrentToLower)
-                                - 1.0;
-                    }
-                    if (numObservationsLowerState > 0)
-                    {
-                        /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                         *
-                         * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                         * acceptances are all positive!), and hence
-                         *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                         * We're catching that case explicitly to avoid numerical
-                         * problems dividing by zero when the overlap between states is small (#3304)
-                         */
-                        real varianceLowerToCurrent = 0;
-                        if (avgAcceptanceLowerToCurrent > 0)
-                        {
-                            varianceLowerToCurrent =
-                                    avgAcceptanceLowerToCurrentSquared
-                                            / (avgAcceptanceLowerToCurrent * avgAcceptanceLowerToCurrent)
-                                    - 1.0;
-                        }
-                        /* Free energy difference to the state one state lower */
-                        /* If either of these quantities is zero, the energies are */
-                        /* way too large for the dynamic range.  We need an alternate guesstimate */
-                        if ((avgAcceptanceCurrentToLower == 0) || (avgAcceptanceLowerToCurrent == 0))
-                        {
-                            weightDifferenceToLower =
-                                    (scaled_lamee[fep_state] - scaled_lamee[fep_state - 1]);
-                        }
-                        else
-                        {
-                            weightDifferenceToLower = (std::log(avgAcceptanceCurrentToLower)
-                                                       - std::log(avgAcceptanceLowerToCurrent))
-                                                      + cnval;
-                        }
-                        /* Variance of the free energy difference to the one state lower */
-                        varianceToLower =
-                                (1.0 / numObservationsCurrentState) * (varianceCurrentToLower)
-                                + (1.0 / numObservationsLowerState) * (varianceLowerToCurrent);
-                    }
-                }
-            }
-
-            if (fep_state < nlim - 1)
-            {
-                if (numObservationsCurrentState > 0)
-                {
-                    /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                     *
-                     * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                     * acceptances are all positive!), and hence
-                     *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                     * We're catching that case explicitly to avoid numerical
-                     * problems dividing by zero when the overlap between states is small (#3304)
-                     */
-
-                    if (avgAcceptanceCurrentToHigher > 0) /* averages are asserted nonnegative above, so only the zero case must be skipped */
-                    {
-                        varianceCurrentToHigher =
-                                avgAcceptanceCurrentToHigherSquared
-                                        / (avgAcceptanceCurrentToHigher * avgAcceptanceCurrentToHigher)
-                                - 1.0;
-                    }
-                    if (numObservationsHigherState > 0)
-                    {
-                        /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                         *
-                         * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                         * acceptances are all positive!), and hence
-                         *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                         * We're catching that case explicitly to avoid numerical
-                         * problems dividing by zero when the overlap between states is small (#3304)
-                         */
-                        real varianceHigherToCurrent = 0;
-                        if (avgAcceptanceHigherToCurrent > 0)
-                        {
-                            varianceHigherToCurrent =
-                                    avgAcceptanceHigherToCurrentSquared
-                                            / (avgAcceptanceHigherToCurrent * avgAcceptanceHigherToCurrent)
-                                    - 1.0;
-                        }
-                        /* Free energy difference to the state one state higher */
-                        /* If either of these quantities is zero, the energies are */
-                        /* way too large for the dynamic range.  We need an alternate guesstimate */
-                        if ((avgAcceptanceHigherToCurrent == 0) || (avgAcceptanceCurrentToHigher == 0))
-                        {
-                            weightDifferenceToHigher =
-                                    (scaled_lamee[fep_state + 1] - scaled_lamee[fep_state]);
-                        }
-                        else
-                        {
-                            weightDifferenceToHigher = (std::log(avgAcceptanceHigherToCurrent)
-                                                        - std::log(avgAcceptanceCurrentToHigher))
-                                                       + cnval;
-                        }
-                        /* Variance of the free energy difference to the one state higher */
-                        varianceToHigher =
-                                (1.0 / numObservationsHigherState) * (varianceHigherToCurrent)
-                                + (1.0 / numObservationsCurrentState) * (varianceCurrentToHigher);
-                    }
-                }
-            }
-
-            if (numObservationsCurrentState > 0)
-            {
-                omegam_array[nval] = varianceCurrentToLower;
-            }
-            else
-            {
-                omegam_array[nval] = 0;
-            }
-            weightsm_array[nval] = weightDifferenceToLower;
-            varm_array[nval]     = varianceToLower;
-            if (numObservationsLowerState > 0)
-            {
-                dwm_array[nval] =
-                        fabs((cnval + std::log((1.0 * numObservationsCurrentState) / numObservationsLowerState))
-                             - lam_dg[fep_state - 1]);
-            }
-            else
-            {
-                dwm_array[nval] = std::fabs(cnval - lam_dg[fep_state - 1]);
-            }
-
-            if (numObservationsCurrentState > 0)
-            {
-                omegap_array[nval] = varianceCurrentToHigher;
-            }
-            else
-            {
-                omegap_array[nval] = 0;
-            }
-            weightsp_array[nval] = weightDifferenceToHigher;
-            varp_array[nval]     = varianceToHigher;
-            if ((numObservationsHigherState > 0) && (numObservationsCurrentState > 0))
-            {
-                dwp_array[nval] =
-                        fabs((cnval + std::log((1.0 * numObservationsHigherState) / numObservationsCurrentState))
-                             - lam_dg[fep_state]);
-            }
-            else
-            {
-                dwp_array[nval] = std::fabs(cnval - lam_dg[fep_state]);
-            }
-        }
-
-        /* find the free energy estimate closest to the guessed weight's value */
-
-        min_nvalm     = FindMinimum(dwm_array, maxc);
-        omega_m1_0    = omegam_array[min_nvalm];
-        clam_weightsm = weightsm_array[min_nvalm];
-        clam_varm     = varm_array[min_nvalm];
-
-        min_nvalp     = FindMinimum(dwp_array, maxc);
-        omega_p1_0    = omegap_array[min_nvalp];
-        clam_weightsp = weightsp_array[min_nvalp];
-        clam_varp     = varp_array[min_nvalp];
-
-        clam_osum   = omega_m1_0 + omega_p1_0;
-        clam_minvar = 0;
-        if (clam_osum > 0)
-        {
-            clam_minvar = 0.5 * std::log(clam_osum);
-        }
-
-        if (fep_state > 0)
-        {
-            lam_dg[fep_state - 1]       = clam_weightsm;
-            lam_variance[fep_state - 1] = clam_varm;
-        }
-
-        if (fep_state < nlim - 1)
-        {
-            lam_dg[fep_state]       = clam_weightsp;
-            lam_variance[fep_state] = clam_varp;
-        }
-
-        if (expand->elamstats == elamstatsMINVAR)
-        {
-            bSufficientSamples = TRUE;
-            /* make sure the number of samples in each state are all
-             * past a user-specified threshold
-             */
-            for (i = 0; i < nlim; i++)
-            {
-                if (dfhist->n_at_lam[i] < expand->minvarmin)
-                {
-                    bSufficientSamples = FALSE;
-                }
-            }
-            if (bSufficientSamples)
-            {
-                dfhist->sum_minvar[fep_state] = clam_minvar;
-                if (fep_state == 0)
-                {
-                    for (i = 0; i < nlim; i++)
-                    {
-                        dfhist->sum_minvar[i] += (expand->minvar_const - clam_minvar);
-                    }
-                    expand->minvar_const          = clam_minvar;
-                    dfhist->sum_minvar[fep_state] = 0.0;
-                }
-                else
-                {
-                    dfhist->sum_minvar[fep_state] -= expand->minvar_const;
-                }
-            }
-        }
-
-        /* we need to rezero minvar now, since it could change at fep_state = 0 */
-        dfhist->sum_dg[0]       = 0.0;
-        dfhist->sum_variance[0] = 0.0;
-        dfhist->sum_weights[0]  = dfhist->sum_dg[0] + dfhist->sum_minvar[0]; /* should be zero */
-
-        for (i = 1; i < nlim; i++)
-        {
-            dfhist->sum_dg[i] = lam_dg[i - 1] + dfhist->sum_dg[i - 1];
-            dfhist->sum_variance[i] =
-                    std::sqrt(lam_variance[i - 1] + gmx::square(dfhist->sum_variance[i - 1]));
-            dfhist->sum_weights[i] = dfhist->sum_dg[i] + dfhist->sum_minvar[i];
-        }
-
-        sfree(lam_dg);
-        sfree(lam_variance);
-
-        sfree(omegam_array);
-        sfree(weightsm_array);
-        sfree(varm_array);
-        sfree(dwm_array);
-
-        sfree(omegap_array);
-        sfree(weightsp_array);
-        sfree(varp_array);
-        sfree(dwp_array);
-    }
-    return FALSE;
-}
-
-static int ChooseNewLambda(int               nlim,
-                           const t_expanded* expand,
-                           df_history_t*     dfhist,
-                           int               fep_state,
-                           const real*       weighted_lamee,
-                           double*           p_k,
-                           int64_t           seed,
-                           int64_t           step)
-{
-    /* Runs expand->lmc_repeats Monte Carlo moves in lambda space starting from fep_state, adds each move's propose*accept terms to dfhist->Tij, counts the net start->end move in dfhist->Tij_empirical, and returns the final state. */
-    /* During the forced initial march (lmc_forced_nstart) it returns early without touching either matrix. seed and step key the random stream; p_k is passed to GenerateGibbsProbabilities and then read as the per-state probabilities. */
-    int                  i, ifep, minfep, maxfep, lamnew, lamtrial, starting_fep_state;
-    real                 r1, r2, de, trialprob, tprob = 0;
-    double *             propose, *accept, *remainder;
-    double               pks;
-    real                 pnorm;
-    gmx::ThreeFry2x64<0> rng(
-            seed, gmx::RandomDomain::ExpandedEnsemble); // We only draw once, so zero bits internal counter is fine
-    gmx::UniformRealDistribution<real> dist;
-
-    starting_fep_state = fep_state;
-    lamnew             = fep_state; /* so that there is a default setting -- stays the same */
-
-    // With PLUMED active and no lambda-statistics method (elamstatsNO), skip the forced initial march below
-    if (!plumedswitch || expand->elamstats != elamstatsNO)
-    {
-    if (!EWL(expand->elamstats)) /* ignore equilibrating the weights if using WL */
-    {
-        if ((expand->lmc_forced_nstart > 0) && (dfhist->n_at_lam[nlim - 1] <= expand->lmc_forced_nstart))
-        {
-            /* Use a marching method to run through the lambdas and get preliminary free energy data,
-               before starting 'free' sampling.  We start free sampling when we have enough at each lambda */
-
-            /* if we have enough at this lambda, move on to the next one */
-
-            if (dfhist->n_at_lam[fep_state] == expand->lmc_forced_nstart)
-            {
-                lamnew = fep_state + 1;
-                if (lamnew == nlim) /* whoops, stepped too far! */
-                {
-                    lamnew -= 1;
-                }
-            }
-            else
-            {
-                lamnew = fep_state;
-            }
-            return lamnew;
-        }
-    }
-    }
-
-    snew(propose, nlim);
-    snew(accept, nlim);
-    snew(remainder, nlim);
-
-    for (i = 0; i < expand->lmc_repeats; i++)
-    {
-        rng.restart(step, i);
-        dist.reset();
-
-        for (ifep = 0; ifep < nlim; ifep++)
-        {
-            propose[ifep] = 0;
-            accept[ifep]  = 0;
-        }
-
-        if ((expand->elmcmove == elmcmoveGIBBS) || (expand->elmcmove == elmcmoveMETGIBBS))
-        {
-            /* use the Gibbs sampler, with restricted range */
-            if (expand->gibbsdeltalam < 0)
-            {
-                minfep = 0;
-                maxfep = nlim - 1;
-            }
-            else
-            {
-                minfep = fep_state - expand->gibbsdeltalam;
-                maxfep = fep_state + expand->gibbsdeltalam;
-                if (minfep < 0)
-                {
-                    minfep = 0;
-                }
-                if (maxfep > nlim - 1)
-                {
-                    maxfep = nlim - 1;
-                }
-            }
-
-            GenerateGibbsProbabilities(weighted_lamee, p_k, &pks, minfep, maxfep);
-
-            if (expand->elmcmove == elmcmoveGIBBS)
-            {
-                for (ifep = minfep; ifep <= maxfep; ifep++)
-                {
-                    propose[ifep] = p_k[ifep];
-                    accept[ifep]  = 1.0;
-                }
-                /* Gibbs sampling */
-                r1 = dist(rng);
-                for (lamnew = minfep; lamnew <= maxfep; lamnew++)
-                {
-                    if (r1 <= p_k[lamnew])
-                    {
-                        break;
-                    }
-                    r1 -= p_k[lamnew];
-                }
-            }
-            else if (expand->elmcmove == elmcmoveMETGIBBS)
-            {
-
-                /* Metropolized Gibbs sampling */
-                for (ifep = minfep; ifep <= maxfep; ifep++)
-                {
-                    remainder[ifep] = 1 - p_k[ifep];
-                }
-
-                /* find the proposal probabilities */
-
-                if (remainder[fep_state] == 0)
-                {
-                    /* only the current state has any probability */
-                    /* we have to stay at the current state */
-                    lamnew = fep_state;
-                }
-                else
-                {
-                    for (ifep = minfep; ifep <= maxfep; ifep++)
-                    {
-                        if (ifep != fep_state)
-                        {
-                            propose[ifep] = p_k[ifep] / remainder[fep_state];
-                        }
-                        else
-                        {
-                            propose[ifep] = 0;
-                        }
-                    }
-
-                    r1 = dist(rng);
-                    for (lamtrial = minfep; lamtrial <= maxfep; lamtrial++)
-                    {
-                        pnorm = p_k[lamtrial] / remainder[fep_state];
-                        if (lamtrial != fep_state)
-                        {
-                            if (r1 <= pnorm)
-                            {
-                                break;
-                            }
-                            r1 -= pnorm;
-                        }
-                    }
-
-                    /* we have now selected lamtrial according to p(lamtrial)/1-p(fep_state) */
-                    tprob = 1.0;
-                    /* trial probability is min{1,\frac{1 - p(old)}{1-p(new)} MRS 1/8/2008 */
-                    trialprob = (remainder[fep_state]) / (remainder[lamtrial]);
-                    if (trialprob < tprob)
-                    {
-                        tprob = trialprob;
-                    }
-                    r2 = dist(rng);
-                    if (r2 < tprob)
-                    {
-                        lamnew = lamtrial;
-                    }
-                    else
-                    {
-                        lamnew = fep_state;
-                    }
-                }
-
-                /* now figure out the acceptance probability for each */
-                for (ifep = minfep; ifep <= maxfep; ifep++)
-                {
-                    tprob = 1.0;
-                    if (remainder[ifep] != 0)
-                    {
-                        trialprob = (remainder[fep_state]) / (remainder[ifep]);
-                    }
-                    else
-                    {
-                        trialprob = 1.0; /* this state is the only choice! */
-                    }
-                    if (trialprob < tprob)
-                    {
-                        tprob = trialprob;
-                    }
-                    /* the entry for ifep == fep_state is irrelevant: propose[fep_state] is 0, so it is never proposed */
-                    accept[ifep] = tprob;
-                }
-            }
-
-            if (lamnew > maxfep)
-            {
-                /* it's possible some rounding is failing */
-                if (gmx_within_tol(remainder[fep_state], 0, 50 * GMX_DOUBLE_EPS))
-                {
-                    /* numerical rounding error -- no state other than the original has weight */
-                    lamnew = fep_state;
-                }
-                else
-                {
-                    /* probably not a numerical issue */
-                    int   loc    = 0;
-                    int   nerror = 200 + (maxfep - minfep + 1) * 60;
-                    char* errorstr;
-                    snew(errorstr, nerror);
-                    /* if it's greater than maxfep, then something went wrong -- probably underflow
-                       in the calculation of sum weights. Generate detailed info for the failure */
-                    loc += sprintf(
-                            errorstr,
-                            "Something wrong in choosing new lambda state with a Gibbs move -- "
-                            "probably underflow in weight determination.\nDenominator is: "
-                            "%3d%17.10e\n  i                dE        numerator          weights\n",
-                            0, pks);
-                    for (ifep = minfep; ifep <= maxfep; ifep++)
-                    {
-                        loc += sprintf(&errorstr[loc], "%3d %17.10e%17.10e%17.10e\n", ifep,
-                                       weighted_lamee[ifep], p_k[ifep], dfhist->sum_weights[ifep]);
-                    }
-                    gmx_fatal(FARGS, "%s", errorstr);
-                }
-            }
-        }
-        else if ((expand->elmcmove == elmcmoveMETROPOLIS) || (expand->elmcmove == elmcmoveBARKER))
-        {
-            /* use the metropolis sampler with trial +/- 1 */
-            r1 = dist(rng);
-            if (r1 < 0.5)
-            {
-                if (fep_state == 0)
-                {
-                    lamtrial = fep_state;
-                }
-                else
-                {
-                    lamtrial = fep_state - 1;
-                }
-            }
-            else
-            {
-                if (fep_state == nlim - 1)
-                {
-                    lamtrial = fep_state;
-                }
-                else
-                {
-                    lamtrial = fep_state + 1;
-                }
-            }
-
-            de = weighted_lamee[lamtrial] - weighted_lamee[fep_state];
-            if (expand->elmcmove == elmcmoveMETROPOLIS)
-            {
-                tprob = 1.0;
-                if (de < 0)
-                {
-                    tprob = std::exp(de);
-                }
-                propose[fep_state] = 0;
-                propose[lamtrial]  = 1.0; /* note that this overwrites the above line if fep_state == lamtrial, which only occurs at the ends */
-                accept[fep_state] =
-                        1.0; /* doesn't actually matter, never proposed unless fep_state == lamtrial, in which case it's 1.0 anyway */
-                accept[lamtrial] = tprob;
-            }
-            else if (expand->elmcmove == elmcmoveBARKER)
-            {
-                if (de > 0) /* Numerically stable version */
-                {
-                    tprob = 1.0 / (1.0 + std::exp(-de));
-                }
-                else if (de <= 0) /* includes de == 0, which yields 0.5 instead of leaving tprob at a stale value */
-                {
-                    tprob = std::exp(de) / (std::exp(de) + 1.0);
-                }
-                propose[fep_state] = (1 - tprob);
-                propose[lamtrial] +=
-                        tprob; /* we add, to account for the fact that at the end, they might be the same point */
-                accept[fep_state] = 1.0;
-                accept[lamtrial]  = 1.0;
-            }
-
-            r2 = dist(rng);
-            if (r2 < tprob)
-            {
-                lamnew = lamtrial;
-            }
-            else
-            {
-                lamnew = fep_state;
-            }
-        }
-
-        for (ifep = 0; ifep < nlim; ifep++)
-        {
-            dfhist->Tij[fep_state][ifep] += propose[ifep] * accept[ifep];
-            dfhist->Tij[fep_state][fep_state] += propose[ifep] * (1.0 - accept[ifep]);
-        }
-        fep_state = lamnew;
-    }
-
-    dfhist->Tij_empirical[starting_fep_state][lamnew] += 1.0;
-
-    sfree(propose);
-    sfree(accept);
-    sfree(remainder);
-
-    return lamnew;
-}
-
-/* Print the per-state lambda table (counts, weights, free energy differences) and, on transition-matrix steps, both transition matrices to outfile; nothing is written unless step is a multiple of frequency. The row of fep_state is marked with "<<". */
-void PrintFreeEnergyInfoToFile(FILE*               outfile,
-                               const t_lambda*     fep,
-                               const t_expanded*   expand,
-                               const t_simtemp*    simtemp,
-                               const df_history_t* dfhist,
-                               int                 fep_state,
-                               int                 frequency,
-                               int64_t             step)
-{
-    int         nlim, i, ifep, jfep;
-    real        dw, dg, dv, Tprint;
-    const char* print_names[efptNR] = { " FEPL", "MassL", "CoulL",   " VdwL",
-                                        "BondL", "RestT", "Temp.(K)" };
-    gmx_bool    bSimTemp            = FALSE;
-    /* simtemp may be nullptr; its temperatures are only read when it is set (bSimTemp) */
-    nlim = fep->n_lambda;
-    if (simtemp != nullptr)
-    {
-        bSimTemp = TRUE;
-    }
-
-    if (step % frequency == 0)
-    {
-        fprintf(outfile, "             MC-lambda information\n");
-        if (EWL(expand->elamstats) && (!(dfhist->bEquil)))
-        {
-            fprintf(outfile, "  Wang-Landau incrementor is: %11.5g\n", dfhist->wl_delta);
-        }
-        fprintf(outfile, "  N");
-        for (i = 0; i < efptNR; i++)
-        {
-            if (fep->separate_dvdl[i])
-            {
-                fprintf(outfile, "%7s", print_names[i]);
-            }
-            else if ((i == efptTEMPERATURE) && bSimTemp)
-            {
-                fprintf(outfile, "%10s", print_names[i]); /* more space for temperature formats */
-            }
-        }
-        fprintf(outfile, "    Count   ");
-        if (expand->elamstats == elamstatsMINVAR)
-        {
-            fprintf(outfile, "W(in kT)   G(in kT)  dG(in kT)  dV(in kT)\n");
-        }
-        else
-        {
-            fprintf(outfile, "G(in kT)  dG(in kT)\n");
-        }
-        for (ifep = 0; ifep < nlim; ifep++)
-        {
-            if (ifep == nlim - 1)
-            {
-                dw = 0.0;
-                dg = 0.0;
-                dv = 0.0;
-            }
-            else
-            {
-                dw = dfhist->sum_weights[ifep + 1] - dfhist->sum_weights[ifep];
-                dg = dfhist->sum_dg[ifep + 1] - dfhist->sum_dg[ifep];
-                dv = std::sqrt(gmx::square(dfhist->sum_variance[ifep + 1])
-                               - gmx::square(dfhist->sum_variance[ifep]));
-            }
-            fprintf(outfile, "%3d", (ifep + 1));
-            for (i = 0; i < efptNR; i++)
-            {
-                if (fep->separate_dvdl[i])
-                {
-                    fprintf(outfile, "%7.3f", fep->all_lambda[i][ifep]);
-                }
-                else if (i == efptTEMPERATURE && bSimTemp)
-                {
-                    fprintf(outfile, "%9.3f", simtemp->temperatures[ifep]);
-                }
-            }
-            if (EWL(expand->elamstats)
-                && (!(dfhist->bEquil))) /* if performing WL and still haven't equilibrated */
-            {
-                if (expand->elamstats == elamstatsWL)
-                {
-                    fprintf(outfile, " %8d", static_cast<int>(dfhist->wl_histo[ifep]));
-                }
-                else
-                {
-                    fprintf(outfile, " %8.3f", dfhist->wl_histo[ifep]);
-                }
-            }
-            else /* we have equilibrated weights */
-            {
-                fprintf(outfile, " %8d", dfhist->n_at_lam[ifep]);
-            }
-            if (expand->elamstats == elamstatsMINVAR)
-            {
-                fprintf(outfile, " %10.5f %10.5f %10.5f %10.5f", dfhist->sum_weights[ifep],
-                        dfhist->sum_dg[ifep], dg, dv);
-            }
-            else
-            {
-                fprintf(outfile, " %10.5f %10.5f", dfhist->sum_weights[ifep], dw);
-            }
-            if (ifep == fep_state)
-            {
-                fprintf(outfile, " <<\n");
-            }
-            else
-            {
-                fprintf(outfile, "   \n");
-            }
-        }
-        fprintf(outfile, "\n");
-
-        if ((expand->nstTij > 0) && (step > 0) && (step % expand->nstTij == 0)) /* nstTij is tested first so the modulo never sees a zero divisor */
-        {
-            fprintf(outfile, "                     Transition Matrix\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                fprintf(outfile, "%12d", (ifep + 1));
-            }
-            fprintf(outfile, "\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                for (jfep = 0; jfep < nlim; jfep++)
-                {
-                    if (dfhist->n_at_lam[ifep] > 0)
-                    {
-                        if (expand->bSymmetrizedTMatrix)
-                        {
-                            Tprint = (dfhist->Tij[ifep][jfep] + dfhist->Tij[jfep][ifep])
-                                     / (dfhist->n_at_lam[ifep] + dfhist->n_at_lam[jfep]);
-                        }
-                        else
-                        {
-                            Tprint = (dfhist->Tij[ifep][jfep]) / (dfhist->n_at_lam[ifep]);
-                        }
-                    }
-                    else
-                    {
-                        Tprint = 0.0;
-                    }
-                    fprintf(outfile, "%12.8f", Tprint);
-                }
-                fprintf(outfile, "%3d\n", (ifep + 1));
-            }
-
-            fprintf(outfile, "                  Empirical Transition Matrix\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                fprintf(outfile, "%12d", (ifep + 1));
-            }
-            fprintf(outfile, "\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                for (jfep = 0; jfep < nlim; jfep++)
-                {
-                    if (dfhist->n_at_lam[ifep] > 0)
-                    {
-                        if (expand->bSymmetrizedTMatrix)
-                        {
-                            Tprint = (dfhist->Tij_empirical[ifep][jfep] + dfhist->Tij_empirical[jfep][ifep])
-                                     / (dfhist->n_at_lam[ifep] + dfhist->n_at_lam[jfep]);
-                        }
-                        else
-                        {
-                            Tprint = dfhist->Tij_empirical[ifep][jfep] / (dfhist->n_at_lam[ifep]);
-                        }
-                    }
-                    else
-                    {
-                        Tprint = 0.0;
-                    }
-                    fprintf(outfile, "%12.8f", Tprint);
-                }
-                fprintf(outfile, "%3d\n", (ifep + 1));
-            }
-        }
-    }
-}
-
-int ExpandedEnsembleDynamics(FILE*                 log,
-                             const t_inputrec*     ir,
-                             const gmx_enerdata_t* enerd,
-                             t_state*              state,
-                             t_extmass*            MassQ,
-                             int                   fep_state,
-                             df_history_t*         dfhist,
-                             int64_t               step,
-                             rvec*                 v,
-                             const t_mdatoms*      mdatoms,
-                             real*                 realFepState)
-/* Note that the state variable is only needed for simulated tempering, not
-   Hamiltonian expanded ensemble.  May be able to remove it after integrator refactoring. */
-{
-    real *      pfep_lamee, *scaled_lamee, *weighted_lamee;
-    double*     p_k;
-    int         i, nlim, lamnew, totalsamples;
-    real        oneovert, maxscaled = 0, maxweighted = 0;
-    t_expanded* expand;
-    t_simtemp*  simtemp;
-    gmx_bool    bIfReset, bSwitchtoOneOverT, bDoneEquilibrating = FALSE;
-
-    expand  = ir->expandedvals;
-    simtemp = ir->simtempvals;
-    nlim    = ir->fepvals->n_lambda;
-
-    snew(scaled_lamee, nlim);
-    snew(weighted_lamee, nlim);
-    snew(pfep_lamee, nlim);
-    snew(p_k, nlim);
-
-    /* update the count at the current lambda*/
-    dfhist->n_at_lam[fep_state]++;
-
-    /* need to calculate the PV term somewhere, but not needed here? Not until there's a lambda
-       state that's pressure controlled.*/
-    /*
-       pVTerm = 0;
-       where does this PV term go?
-       for (i=0;i<nlim;i++)
-       {
-       fep_lamee[i] += pVTerm;
-       }
-     */
-
-    /* determine the minimum value to avoid overflow.  Probably a better way to do this */
-    /* we don't need to include the pressure term, since the volume is the same between the two.
-       is there some term we are neglecting, however? */
-
-    if (ir->efep != efepNO)
-    {
-        for (i = 0; i < nlim; i++)
-        {
-            if (ir->bSimTemp)
-            {
-                /* Note -- this assumes no mass changes, since kinetic energy is not added  . . . */
-                scaled_lamee[i] = enerd->foreignLambdaTerms.deltaH(i) / (simtemp->temperatures[i] * BOLTZ)
-                                  + enerd->term[F_EPOT]
-                                            * (1.0 / (simtemp->temperatures[i])
-                                               - 1.0 / (simtemp->temperatures[fep_state]))
-                                            / BOLTZ;
-            }
-            else
-            {
-                scaled_lamee[i] = enerd->foreignLambdaTerms.deltaH(i) / (expand->mc_temp * BOLTZ);
-                /* mc_temp is currently set to the system reft unless otherwise defined */
-            }
-
-            /* save these energies for printing, so they don't get overwritten by the next step */
-            /* they aren't overwritten in the non-free energy case, but we always print with these
-               for simplicity */
-        }
-    }
-    else
-    {
-        if (ir->bSimTemp)
-        {
-            for (i = 0; i < nlim; i++)
-            {
-                scaled_lamee[i] =
-                        enerd->term[F_EPOT]
-                        * (1.0 / simtemp->temperatures[i] - 1.0 / simtemp->temperatures[fep_state]) / BOLTZ;
-            }
-        }
-    }
-
-    for (i = 0; i < nlim; i++)
-    {
-        pfep_lamee[i] = scaled_lamee[i];
-
-        weighted_lamee[i] = dfhist->sum_weights[i] - scaled_lamee[i];
-        if (i == 0)
-        {
-            maxscaled   = scaled_lamee[i];
-            maxweighted = weighted_lamee[i];
-        }
-        else
-        {
-            if (scaled_lamee[i] > maxscaled)
-            {
-                maxscaled = scaled_lamee[i];
-            }
-            if (weighted_lamee[i] > maxweighted)
-            {
-                maxweighted = weighted_lamee[i];
-            }
-        }
-    }
-
-    for (i = 0; i < nlim; i++)
-    {
-        scaled_lamee[i] -= maxscaled;
-        weighted_lamee[i] -= maxweighted;
-    }
-
-    if (plumedswitch && expand->elamstats == elamstatsNO)
-    {
-        // Update weights at all lambda states with current values from Plumed.
-        // For acceptance criterion, expanded ensemble is expecting the weight at
-        // lambda i=0 to be zero.
-        real zeroBias = 0;
-        for (i = 0; i < nlim; i++)
-        {
-            *realFepState = i;
-            real bias = 0;
-            plumed_cmd(plumedmain, "prepareCalc", nullptr);
-            plumed_cmd(plumedmain, "performCalcNoForces", nullptr);
-            plumed_cmd(plumedmain, "getBias", &bias);
-            bias /= expand->mc_temp * BOLTZ;
-            if (i == 0)
-            {
-                zeroBias = bias;
-            }
-            dfhist->sum_weights[i] = -bias + zeroBias;
-        }
-        *realFepState = fep_state;
-    }
-    else // Don't update weights using different method when Plumed is active
-    {
-    /* update weights - we decide whether or not to actually do this inside */
-
-    bDoneEquilibrating =
-            UpdateWeights(nlim, expand, dfhist, fep_state, scaled_lamee, weighted_lamee, step);
-    if (bDoneEquilibrating)
-    {
-        if (log)
-        {
-            fprintf(log, "\nStep %" PRId64 ": Weights have equilibrated, using criteria: %s\n",
-                    step, elmceq_names[expand->elmceq]);
-        }
-    }
-    }
-
-    // Accept / reject is handled by GROMACS (possibly with Plumed weights).
-    lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k,
-                             ir->expandedvals->lmc_seed, step);
-    /* if using simulated tempering, we need to adjust the temperatures */
-    if (ir->bSimTemp && (lamnew != fep_state)) /* only need to change the temperatures if we change the state */
-    {
-        int   i, j, n, d;
-        real* buf_ngtc;
-        real  told;
-        int   nstart, nend, gt;
-
-        snew(buf_ngtc, ir->opts.ngtc);
-
-        for (i = 0; i < ir->opts.ngtc; i++)
-        {
-            if (ir->opts.ref_t[i] > 0)
-            {
-                told              = ir->opts.ref_t[i];
-                ir->opts.ref_t[i] = simtemp->temperatures[lamnew];
-                buf_ngtc[i]       = std::sqrt(ir->opts.ref_t[i] / told); /* using the buffer as temperature scaling */
-            }
-        }
-
-        /* we don't need to manipulate the ekind information, as it isn't due to be reset until the next step anyway */
-
-        nstart = 0;
-        nend   = mdatoms->homenr;
-        for (n = nstart; n < nend; n++)
-        {
-            gt = 0;
-            if (mdatoms->cTC)
-            {
-                gt = mdatoms->cTC[n];
-            }
-            for (d = 0; d < DIM; d++)
-            {
-                v[n][d] *= buf_ngtc[gt];
-            }
-        }
-
-        if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir))
-        {
-            /* we need to recalculate the masses if the temperature has changed */
-            init_npt_masses(ir, state, MassQ, FALSE);
-            for (i = 0; i < state->nnhpres; i++)
-            {
-                for (j = 0; j < ir->opts.nhchainlength; j++)
-                {
-                    state->nhpres_vxi[i + j] *= buf_ngtc[i];
-                }
-            }
-            for (i = 0; i < ir->opts.ngtc; i++)
-            {
-                for (j = 0; j < ir->opts.nhchainlength; j++)
-                {
-                    state->nosehoover_vxi[i + j] *= buf_ngtc[i];
-                }
-            }
-        }
-        sfree(buf_ngtc);
-    }
-
-    /* now check on the Wang-Landau updating critera */
-
-    if (EWL(expand->elamstats))
-    {
-        bSwitchtoOneOverT = FALSE;
-        if (expand->bWLoneovert)
-        {
-            totalsamples = 0;
-            for (i = 0; i < nlim; i++)
-            {
-                totalsamples += dfhist->n_at_lam[i];
-            }
-            oneovert = (1.0 * nlim) / totalsamples;
-            /* oneovert has decreasd by a bit since last time, so we actually make sure its within one of this number */
-            /* switch to 1/t incrementing when wl_delta has decreased at least once, and wl_delta is now less than 1/t */
-            if ((dfhist->wl_delta <= ((totalsamples) / (totalsamples - 1.00001)) * oneovert)
-                && (dfhist->wl_delta < expand->init_wl_delta))
-            {
-                bSwitchtoOneOverT = TRUE;
-            }
-        }
-        if (bSwitchtoOneOverT)
-        {
-            dfhist->wl_delta =
-                    oneovert; /* now we reduce by this each time, instead of only at flatness */
-        }
-        else
-        {
-            bIfReset = CheckHistogramRatios(nlim, dfhist->wl_histo, expand->wl_ratio);
-            if (bIfReset)
-            {
-                for (i = 0; i < nlim; i++)
-                {
-                    dfhist->wl_histo[i] = 0;
-                }
-                dfhist->wl_delta *= expand->wl_scale;
-                if (log)
-                {
-                    fprintf(log, "\nStep %d: weights are now:", static_cast<int>(step));
-                    for (i = 0; i < nlim; i++)
-                    {
-                        fprintf(log, " %.5f", dfhist->sum_weights[i]);
-                    }
-                    fprintf(log, "\n");
-                }
-            }
-        }
-    }
-    sfree(pfep_lamee);
-    sfree(scaled_lamee);
-    sfree(weighted_lamee);
-    sfree(p_k);
-
-    return lamnew;
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.cpp.preplumed
deleted file mode 100644
index d48016a4d6..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.cpp.preplumed
+++ /dev/null
@@ -1,1580 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include "expanded.h"
-
-#include <cmath>
-#include <cstdio>
-
-#include <algorithm>
-
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/confio.h"
-#include "gromacs/fileio/gmxfio.h"
-#include "gromacs/fileio/xtcio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/orires.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/calcmu.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/random/threefry.h"
-#include "gromacs/random/uniformrealdistribution.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/smalloc.h"
-
-#include "expanded_internal.h"
-
-static void init_df_history_weights(df_history_t* dfhist, const t_expanded* expand, int nlim)
-{
-    int i;
-    dfhist->wl_delta = expand->init_wl_delta;
-    for (i = 0; i < nlim; i++)
-    {
-        dfhist->sum_weights[i] = expand->init_lambda_weights[i];
-        dfhist->sum_dg[i]      = expand->init_lambda_weights[i];
-    }
-}
-
-/* Eventually should contain all the functions needed to initialize expanded ensemble
-   before the md loop starts */
-void init_expanded_ensemble(gmx_bool bStateFromCP, const t_inputrec* ir, df_history_t* dfhist)
-{
-    if (!bStateFromCP)
-    {
-        init_df_history_weights(dfhist, ir->expandedvals, ir->fepvals->n_lambda);
-    }
-}
-
-static void GenerateGibbsProbabilities(const real* ene, double* p_k, double* pks, int minfep, int maxfep)
-{
-
-    int  i;
-    real maxene;
-
-    *pks   = 0.0;
-    maxene = ene[minfep];
-    /* find the maximum value */
-    for (i = minfep; i <= maxfep; i++)
-    {
-        if (ene[i] > maxene)
-        {
-            maxene = ene[i];
-        }
-    }
-    /* find the denominator */
-    for (i = minfep; i <= maxfep; i++)
-    {
-        *pks += std::exp(ene[i] - maxene);
-    }
-    /*numerators*/
-    for (i = minfep; i <= maxfep; i++)
-    {
-        p_k[i] = std::exp(ene[i] - maxene) / *pks;
-    }
-}
-
-static void
-GenerateWeightedGibbsProbabilities(const real* ene, double* p_k, double* pks, int nlim, real* nvals, real delta)
-{
-
-    int   i;
-    real  maxene;
-    real* nene;
-    *pks = 0.0;
-
-    snew(nene, nlim);
-    for (i = 0; i < nlim; i++)
-    {
-        if (nvals[i] == 0)
-        {
-            /* add the delta, since we need to make sure it's greater than zero, and
-               we need a non-arbitrary number? */
-            nene[i] = ene[i] + std::log(nvals[i] + delta);
-        }
-        else
-        {
-            nene[i] = ene[i] + std::log(nvals[i]);
-        }
-    }
-
-    /* find the maximum value */
-    maxene = nene[0];
-    for (i = 0; i < nlim; i++)
-    {
-        if (nene[i] > maxene)
-        {
-            maxene = nene[i];
-        }
-    }
-
-    /* subtract off the maximum, avoiding overflow */
-    for (i = 0; i < nlim; i++)
-    {
-        nene[i] -= maxene;
-    }
-
-    /* find the denominator */
-    for (i = 0; i < nlim; i++)
-    {
-        *pks += std::exp(nene[i]);
-    }
-
-    /*numerators*/
-    for (i = 0; i < nlim; i++)
-    {
-        p_k[i] = std::exp(nene[i]) / *pks;
-    }
-    sfree(nene);
-}
-
-static int FindMinimum(const real* min_metric, int N)
-{
-
-    real min_val;
-    int  min_nval, nval;
-
-    min_nval = 0;
-    min_val  = min_metric[0];
-
-    for (nval = 0; nval < N; nval++)
-    {
-        if (min_metric[nval] < min_val)
-        {
-            min_val  = min_metric[nval];
-            min_nval = nval;
-        }
-    }
-    return min_nval;
-}
-
-static gmx_bool CheckHistogramRatios(int nhisto, const real* histo, real ratio)
-{
-
-    int      i;
-    real     nmean;
-    gmx_bool bIfFlat;
-
-    nmean = 0;
-    for (i = 0; i < nhisto; i++)
-    {
-        nmean += histo[i];
-    }
-
-    if (nmean == 0)
-    {
-        /* no samples! is bad!*/
-        bIfFlat = FALSE;
-        return bIfFlat;
-    }
-    nmean /= static_cast<real>(nhisto);
-
-    bIfFlat = TRUE;
-    for (i = 0; i < nhisto; i++)
-    {
-        /* make sure that all points are in the ratio < x <  1/ratio range  */
-        if (!((histo[i] / nmean < 1.0 / ratio) && (histo[i] / nmean > ratio)))
-        {
-            bIfFlat = FALSE;
-            break;
-        }
-    }
-    return bIfFlat;
-}
-
-static gmx_bool CheckIfDoneEquilibrating(int nlim, const t_expanded* expand, const df_history_t* dfhist, int64_t step)
-{
-
-    int      i, totalsamples;
-    gmx_bool bDoneEquilibrating = TRUE;
-    gmx_bool bIfFlat;
-
-    /* If we are doing slow growth to get initial values, we haven't finished equilibrating */
-    if (expand->lmc_forced_nstart > 0)
-    {
-        for (i = 0; i < nlim; i++)
-        {
-            if (dfhist->n_at_lam[i]
-                < expand->lmc_forced_nstart) /* we are still doing the initial sweep, so we're
-                                                definitely not done equilibrating*/
-            {
-                bDoneEquilibrating = FALSE;
-                break;
-            }
-        }
-    }
-    else
-    {
-        /* assume we have equilibrated the weights, then check to see if any of the conditions are not met */
-        bDoneEquilibrating = TRUE;
-
-        /* calculate the total number of samples */
-        switch (expand->elmceq)
-        {
-            case elmceqNO:
-                /* We have not equilibrated, and won't, ever. */
-                bDoneEquilibrating = FALSE;
-                break;
-            case elmceqYES:
-                /* we have equilibrated -- we're done */
-                bDoneEquilibrating = TRUE;
-                break;
-            case elmceqSTEPS:
-                /* first, check if we are equilibrating by steps, if we're still under */
-                if (step < expand->equil_steps)
-                {
-                    bDoneEquilibrating = FALSE;
-                }
-                break;
-            case elmceqSAMPLES:
-                totalsamples = 0;
-                for (i = 0; i < nlim; i++)
-                {
-                    totalsamples += dfhist->n_at_lam[i];
-                }
-                if (totalsamples < expand->equil_samples)
-                {
-                    bDoneEquilibrating = FALSE;
-                }
-                break;
-            case elmceqNUMATLAM:
-                for (i = 0; i < nlim; i++)
-                {
-                    if (dfhist->n_at_lam[i]
-                        < expand->equil_n_at_lam) /* we are still doing the initial sweep, so we're
-                                                     definitely not done equilibrating*/
-                    {
-                        bDoneEquilibrating = FALSE;
-                        break;
-                    }
-                }
-                break;
-            case elmceqWLDELTA:
-                if (EWL(expand->elamstats)) /* This check is in readir as well, but
-                                               just to be sure */
-                {
-                    if (dfhist->wl_delta > expand->equil_wl_delta)
-                    {
-                        bDoneEquilibrating = FALSE;
-                    }
-                }
-                break;
-            case elmceqRATIO:
-                /* we can use the flatness as a judge of good weights, as long as
-                   we're not doing minvar, or Wang-Landau.
-                   But turn off for now until we figure out exactly how we do this.
-                 */
-
-                if (!(EWL(expand->elamstats) || expand->elamstats == elamstatsMINVAR))
-                {
-                    /* we want to use flatness -avoiding- the forced-through samples.  Plus, we need
-                       to convert to floats for this histogram function. */
-
-                    real* modhisto;
-                    snew(modhisto, nlim);
-                    for (i = 0; i < nlim; i++)
-                    {
-                        modhisto[i] = 1.0 * (dfhist->n_at_lam[i] - expand->lmc_forced_nstart);
-                    }
-                    bIfFlat = CheckHistogramRatios(nlim, modhisto, expand->equil_ratio);
-                    sfree(modhisto);
-                    if (!bIfFlat)
-                    {
-                        bDoneEquilibrating = FALSE;
-                    }
-                }
-                break;
-            default: bDoneEquilibrating = TRUE; break;
-        }
-    }
-    return bDoneEquilibrating;
-}
-
-static gmx_bool UpdateWeights(int           nlim,
-                              t_expanded*   expand,
-                              df_history_t* dfhist,
-                              int           fep_state,
-                              const real*   scaled_lamee,
-                              const real*   weighted_lamee,
-                              int64_t       step)
-{
-    gmx_bool bSufficientSamples;
-    real     acceptanceWeight;
-    int      i;
-    int      min_nvalm, min_nvalp, maxc;
-    real     omega_m1_0, omega_p1_0;
-    real     zero_sum_weights;
-    real *omegam_array, *weightsm_array, *omegap_array, *weightsp_array, *varm_array, *varp_array,
-            *dwp_array, *dwm_array;
-    real    clam_varm, clam_varp, clam_osum, clam_weightsm, clam_weightsp, clam_minvar;
-    real *  lam_variance, *lam_dg;
-    double* p_k;
-    double  pks = 0;
-
-    /* Future potential todos for this function (see #3848):
-     *  - Update the names in the dhist structure to be clearer. Not done for now since this
-     *    a bugfix update and we are mininizing other code changes.
-     *  - Modularize the code some more.
-     *  - potentially merge with accelerated weight histogram functionality, since it's very similar.
-     */
-    /*  if we have equilibrated the expanded ensemble weights, we are not updating them, so exit now */
-    if (dfhist->bEquil)
-    {
-        return FALSE;
-    }
-
-    if (CheckIfDoneEquilibrating(nlim, expand, dfhist, step))
-    {
-        dfhist->bEquil = TRUE;
-        /* zero out the visited states so we know how many equilibrated states we have
-           from here on out.*/
-        for (i = 0; i < nlim; i++)
-        {
-            dfhist->n_at_lam[i] = 0;
-        }
-        return TRUE;
-    }
-
-    /* If we reached this far, we have not equilibrated yet, keep on
-       going resetting the weights */
-
-    if (EWL(expand->elamstats))
-    {
-        if (expand->elamstats == elamstatsWL) /* Using standard Wang-Landau for weight updates */
-        {
-            dfhist->sum_weights[fep_state] -= dfhist->wl_delta;
-            dfhist->wl_histo[fep_state] += 1.0;
-        }
-        else if (expand->elamstats == elamstatsWWL)
-        /* Using weighted Wang-Landau for weight updates.
-         * Very closly equivalent to accelerated weight histogram approach
-         * applied to expanded ensemble. */
-        {
-            snew(p_k, nlim);
-
-            /* first increment count */
-            GenerateGibbsProbabilities(weighted_lamee, p_k, &pks, 0, nlim - 1);
-            for (i = 0; i < nlim; i++)
-            {
-                dfhist->wl_histo[i] += static_cast<real>(p_k[i]);
-            }
-
-            /* then increment weights (uses count) */
-            pks = 0.0;
-            GenerateWeightedGibbsProbabilities(weighted_lamee, p_k, &pks, nlim, dfhist->wl_histo,
-                                               dfhist->wl_delta);
-
-            for (i = 0; i < nlim; i++)
-            {
-                dfhist->sum_weights[i] -= dfhist->wl_delta * static_cast<real>(p_k[i]);
-            }
-            /* Alternate definition, using logarithms. Shouldn't make very much difference! */
-            /*
-               real di;
-               for (i=0;i<nlim;i++)
-               {
-                di = (real)1.0 + dfhist->wl_delta*(real)p_k[i];
-                dfhist->sum_weights[i] -= log(di);
-               }
-             */
-            sfree(p_k);
-        }
-
-        zero_sum_weights = dfhist->sum_weights[0];
-        for (i = 0; i < nlim; i++)
-        {
-            dfhist->sum_weights[i] -= zero_sum_weights;
-        }
-    }
-
-    if (expand->elamstats == elamstatsBARKER || expand->elamstats == elamstatsMETROPOLIS
-        || expand->elamstats == elamstatsMINVAR)
-    {
-        maxc = 2 * expand->c_range + 1;
-
-        snew(lam_dg, nlim);
-        snew(lam_variance, nlim);
-
-        snew(omegap_array, maxc);
-        snew(weightsp_array, maxc);
-        snew(varp_array, maxc);
-        snew(dwp_array, maxc);
-
-        snew(omegam_array, maxc);
-        snew(weightsm_array, maxc);
-        snew(varm_array, maxc);
-        snew(dwm_array, maxc);
-
-        /* unpack the values of the free energy differences and the
-         * variance in their estimates between nearby lambdas. We will
-         * only actually update 2 of these, the state we are currently
-         * at and the one we end up moving to
-         */
-
-        for (i = 0; i < nlim - 1; i++)
-        { /* only through the second to last */
-            lam_dg[i] = dfhist->sum_dg[i + 1] - dfhist->sum_dg[i];
-            lam_variance[i] =
-                    gmx::square(dfhist->sum_variance[i + 1]) - gmx::square(dfhist->sum_variance[i]);
-        }
-
-        /* accumulate running averages of thermodynamic averages for Bennett Acceptance Ratio-based
-         * estimates of the free energy .
-         * Rather than peforming self-consistent estimation of the free energies at each step,
-         * we keep track of an array of possible different free energies (cnvals),
-         * and we self-consistently choose the best one. The one that leads to a free energy estimate
-         * that is closest to itself is the best estimate of the free energy.  It is essentially a
-         * parallellized version of self-consistent iteration.  maxc is the number of these constants. */
-
-        for (int nval = 0; nval < maxc; nval++)
-        {
-            const real cnval = static_cast<real>(nval - expand->c_range);
-
-            /* Compute acceptance criterion weight to the state below this one for use in averages.
-             * Note we do not have to have just moved from that state to use this free energy
-             * estimate; these are essentially "virtual" moves. */
-
-            if (fep_state > 0)
-            {
-                const auto lambdaEnergyDifference =
-                        cnval - (scaled_lamee[fep_state] - scaled_lamee[fep_state - 1]);
-                acceptanceWeight =
-                        gmx::calculateAcceptanceWeight(expand->elamstats, lambdaEnergyDifference);
-                dfhist->accum_m[fep_state][nval] += acceptanceWeight;
-                dfhist->accum_m2[fep_state][nval] += acceptanceWeight * acceptanceWeight;
-            }
-
-            // Compute acceptance criterion weight to transition to the next state
-            if (fep_state < nlim - 1)
-            {
-                const auto lambdaEnergyDifference =
-                        -cnval + (scaled_lamee[fep_state + 1] - scaled_lamee[fep_state]);
-                acceptanceWeight =
-                        gmx::calculateAcceptanceWeight(expand->elamstats, lambdaEnergyDifference);
-                dfhist->accum_p[fep_state][nval] += acceptanceWeight;
-                dfhist->accum_p2[fep_state][nval] += acceptanceWeight * acceptanceWeight;
-            }
-
-            /* Determination of Metropolis transition and Barker transition weights */
-
-            int numObservationsCurrentState = dfhist->n_at_lam[fep_state];
-            /* determine the number of observations above and below the current state */
-            int numObservationsLowerState = 0;
-            if (fep_state > 0)
-            {
-                numObservationsLowerState = dfhist->n_at_lam[fep_state - 1];
-            }
-            int numObservationsHigherState = 0;
-            if (fep_state < nlim - 1)
-            {
-                numObservationsHigherState = dfhist->n_at_lam[fep_state + 1];
-            }
-
-            /* Calculate the biases for each expanded ensemble state that minimize the total
-             * variance, as implemented in Martinez-Veracoechea and Escobedo,
-             * J. Phys. Chem. B 2008, 112, 8120-8128
-             *
-             * The variance associated with the free energy estimate between two states i and j
-             * is calculated as
-             *     Var(i,j) = {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} / numObservations(i->j)
-             *              + {avg[xi(j->i)^2] / avg[xi(j->i)]^2 - 1} / numObservations(j->i)
-             * where xi(i->j) is the acceptance factor / weight associated with moving from state i to j
-             * As we are calculating the acceptance factor to the neighbors every time we're visiting
-             * a state, numObservations(i->j) == numObservations(i) and numObservations(j->i) == numObservations(j)
-             */
-
-            /* Accumulation of acceptance weight averages between the current state and the
-             * states +1 (p1) and -1 (m1), averaged at current state (0)
-             */
-            real avgAcceptanceCurrentToLower  = 0;
-            real avgAcceptanceCurrentToHigher = 0;
-            /* Accumulation of acceptance weight averages quantities between states 0
-             *  and states +1 and -1, squared
-             */
-            real avgAcceptanceCurrentToLowerSquared  = 0;
-            real avgAcceptanceCurrentToHigherSquared = 0;
-            /* Accumulation of free energy quantities from lower state (m1) to current state (0) and squared */
-            real avgAcceptanceLowerToCurrent        = 0;
-            real avgAcceptanceLowerToCurrentSquared = 0;
-            /* Accumulation of free energy quantities from upper state (p1) to current state (0) and squared */
-            real avgAcceptanceHigherToCurrent        = 0;
-            real avgAcceptanceHigherToCurrentSquared = 0;
-
-            if (numObservationsCurrentState > 0)
-            {
-                avgAcceptanceCurrentToLower = dfhist->accum_m[fep_state][nval] / numObservationsCurrentState;
-                avgAcceptanceCurrentToHigher =
-                        dfhist->accum_p[fep_state][nval] / numObservationsCurrentState;
-                avgAcceptanceCurrentToLowerSquared =
-                        dfhist->accum_m2[fep_state][nval] / numObservationsCurrentState;
-                avgAcceptanceCurrentToHigherSquared =
-                        dfhist->accum_p2[fep_state][nval] / numObservationsCurrentState;
-            }
-
-            if ((fep_state > 0) && (numObservationsLowerState > 0))
-            {
-                avgAcceptanceLowerToCurrent =
-                        dfhist->accum_p[fep_state - 1][nval] / numObservationsLowerState;
-                avgAcceptanceLowerToCurrentSquared =
-                        dfhist->accum_p2[fep_state - 1][nval] / numObservationsLowerState;
-            }
-
-            if ((fep_state < nlim - 1) && (numObservationsHigherState > 0))
-            {
-                avgAcceptanceHigherToCurrent =
-                        dfhist->accum_m[fep_state + 1][nval] / numObservationsHigherState;
-                avgAcceptanceHigherToCurrentSquared =
-                        dfhist->accum_m2[fep_state + 1][nval] / numObservationsHigherState;
-            }
-            /* These are accumulation of positive values (see definition of acceptance functions
-             * above), or of squares of positive values.
-             * We're taking this for granted in the following calculation, so make sure
-             * here that nothing weird happened. Although technically all values should be positive,
-             * because of floating point precisions, they might be numerically zero. */
-            GMX_RELEASE_ASSERT(
-                    avgAcceptanceCurrentToLower >= 0 && avgAcceptanceCurrentToLowerSquared >= 0
-                            && avgAcceptanceCurrentToHigher >= 0
-                            && avgAcceptanceCurrentToHigherSquared >= 0 && avgAcceptanceLowerToCurrent >= 0
-                            && avgAcceptanceLowerToCurrentSquared >= 0 && avgAcceptanceHigherToCurrent >= 0
-                            && avgAcceptanceHigherToCurrentSquared >= 0,
-                    "By definition, the acceptance factors should all be nonnegative.");
-
-            real varianceCurrentToLower   = 0;
-            real varianceCurrentToHigher  = 0;
-            real weightDifferenceToLower  = 0;
-            real weightDifferenceToHigher = 0;
-            real varianceToLower          = 0;
-            real varianceToHigher         = 0;
-
-            if (fep_state > 0)
-            {
-                if (numObservationsCurrentState > 0)
-                {
-                    /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                     *
-                     * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                     * acceptances are all positive!), and hence
-                     *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                     * We're catching that case explicitly to avoid numerical
-                     * problems dividing by zero when the overlap between states is small (#3304)
-                     */
-                    if (avgAcceptanceCurrentToLower > 0)
-                    {
-                        varianceCurrentToLower =
-                                avgAcceptanceCurrentToLowerSquared
-                                        / (avgAcceptanceCurrentToLower * avgAcceptanceCurrentToLower)
-                                - 1.0;
-                    }
-                    if (numObservationsLowerState > 0)
-                    {
-                        /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                         *
-                         * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                         * acceptances are all positive!), and hence
-                         *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                         * We're catching that case explicitly to avoid numerical
-                         * problems dividing by zero when the overlap between states is small (#3304)
-                         */
-                        real varianceLowerToCurrent = 0;
-                        if (avgAcceptanceLowerToCurrent > 0)
-                        {
-                            varianceLowerToCurrent =
-                                    avgAcceptanceLowerToCurrentSquared
-                                            / (avgAcceptanceLowerToCurrent * avgAcceptanceLowerToCurrent)
-                                    - 1.0;
-                        }
-                        /* Free energy difference to the state one state lower */
-                        /* if these either of these quantities are zero, the energies are */
-                        /* way too large for the dynamic range.  We need an alternate guesstimate */
-                        if ((avgAcceptanceCurrentToLower == 0) || (avgAcceptanceLowerToCurrent == 0))
-                        {
-                            weightDifferenceToLower =
-                                    (scaled_lamee[fep_state] - scaled_lamee[fep_state - 1]);
-                        }
-                        else
-                        {
-                            weightDifferenceToLower = (std::log(avgAcceptanceCurrentToLower)
-                                                       - std::log(avgAcceptanceLowerToCurrent))
-                                                      + cnval;
-                        }
-                        /* Variance of the free energy difference to the one state lower */
-                        varianceToLower =
-                                (1.0 / numObservationsCurrentState) * (varianceCurrentToLower)
-                                + (1.0 / numObservationsLowerState) * (varianceLowerToCurrent);
-                    }
-                }
-            }
-
-            /* Estimate the weight difference and its variance between the current state
-             * and the next higher state. Mirrors the lower-state branch above. */
-            if (fep_state < nlim - 1)
-            {
-                if (numObservationsCurrentState > 0)
-                {
-                    /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                     *
-                     * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                     * acceptances are all positive!), and hence
-                     *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                     * We're catching that case explicitly to avoid numerical
-                     * problems dividing by zero when the overlap between states is small (#3304)
-                     */
-
-                    /* The averages are asserted to be nonnegative above, so the guard has to
-                     * test for a strictly positive mean: a "< 0" test could never be true and
-                     * would leave varianceCurrentToHigher at zero for every sample. */
-                    if (avgAcceptanceCurrentToHigher > 0)
-                    {
-                        varianceCurrentToHigher =
-                                avgAcceptanceCurrentToHigherSquared
-                                        / (avgAcceptanceCurrentToHigher * avgAcceptanceCurrentToHigher)
-                                - 1.0;
-                    }
-                    if (numObservationsHigherState > 0)
-                    {
-                        /* Calculate {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1}
-                         *
-                         * Note that if avg[xi(i->j)] == 0, also avg[xi(i->j)^2] == 0 (since the
-                         * acceptances are all positive!), and hence
-                         *     {avg[xi(i->j)^2] / avg[xi(i->j)]^2 - 1} -> 0  for  avg[xi(i->j)] -> 0
-                         * We're catching that case explicitly to avoid numerical
-                         * problems dividing by zero when the overlap between states is small (#3304)
-                         */
-                        real varianceHigherToCurrent = 0;
-                        if (avgAcceptanceHigherToCurrent > 0)
-                        {
-                            varianceHigherToCurrent =
-                                    avgAcceptanceHigherToCurrentSquared
-                                            / (avgAcceptanceHigherToCurrent * avgAcceptanceHigherToCurrent)
-                                    - 1.0;
-                        }
-                        /* Free energy difference to the state one state higher */
-                        /* if either of these quantities is zero, the energies are */
-                        /* way too large for the dynamic range.  We need an alternate guesstimate */
-                        if ((avgAcceptanceHigherToCurrent == 0) || (avgAcceptanceCurrentToHigher == 0))
-                        {
-                            weightDifferenceToHigher =
-                                    (scaled_lamee[fep_state + 1] - scaled_lamee[fep_state]);
-                        }
-                        else
-                        {
-                            weightDifferenceToHigher = (std::log(avgAcceptanceHigherToCurrent)
-                                                        - std::log(avgAcceptanceCurrentToHigher))
-                                                       + cnval;
-                        }
-                        /* Variance of the free energy difference to the one state higher */
-                        varianceToHigher =
-                                (1.0 / numObservationsHigherState) * (varianceHigherToCurrent)
-                                + (1.0 / numObservationsCurrentState) * (varianceCurrentToHigher);
-                    }
-                }
-            }
-
-            if (numObservationsCurrentState > 0)
-            {
-                omegam_array[nval] = varianceCurrentToLower;
-            }
-            else
-            {
-                omegam_array[nval] = 0;
-            }
-            weightsm_array[nval] = weightDifferenceToLower;
-            varm_array[nval]     = varianceToLower;
-            if (numObservationsLowerState > 0)
-            {
-                dwm_array[nval] =
-                        fabs((cnval + std::log((1.0 * numObservationsCurrentState) / numObservationsLowerState))
-                             - lam_dg[fep_state - 1]);
-            }
-            else
-            {
-                dwm_array[nval] = std::fabs(cnval - lam_dg[fep_state - 1]);
-            }
-
-            if (numObservationsCurrentState > 0)
-            {
-                omegap_array[nval] = varianceCurrentToHigher;
-            }
-            else
-            {
-                omegap_array[nval] = 0;
-            }
-            weightsp_array[nval] = weightDifferenceToHigher;
-            varp_array[nval]     = varianceToHigher;
-            if ((numObservationsHigherState > 0) && (numObservationsCurrentState > 0))
-            {
-                dwp_array[nval] =
-                        fabs((cnval + std::log((1.0 * numObservationsHigherState) / numObservationsCurrentState))
-                             - lam_dg[fep_state]);
-            }
-            else
-            {
-                dwp_array[nval] = std::fabs(cnval - lam_dg[fep_state]);
-            }
-        }
-
-        /* find the free energy estimate closest to the guessed weight's value */
-
-        min_nvalm     = FindMinimum(dwm_array, maxc);
-        omega_m1_0    = omegam_array[min_nvalm];
-        clam_weightsm = weightsm_array[min_nvalm];
-        clam_varm     = varm_array[min_nvalm];
-
-        min_nvalp     = FindMinimum(dwp_array, maxc);
-        omega_p1_0    = omegap_array[min_nvalp];
-        clam_weightsp = weightsp_array[min_nvalp];
-        clam_varp     = varp_array[min_nvalp];
-
-        clam_osum   = omega_m1_0 + omega_p1_0;
-        clam_minvar = 0;
-        if (clam_osum > 0)
-        {
-            clam_minvar = 0.5 * std::log(clam_osum);
-        }
-
-        if (fep_state > 0)
-        {
-            lam_dg[fep_state - 1]       = clam_weightsm;
-            lam_variance[fep_state - 1] = clam_varm;
-        }
-
-        if (fep_state < nlim - 1)
-        {
-            lam_dg[fep_state]       = clam_weightsp;
-            lam_variance[fep_state] = clam_varp;
-        }
-
-        if (expand->elamstats == elamstatsMINVAR)
-        {
-            bSufficientSamples = TRUE;
-            /* make sure the number of samples in each state are all
-             * past a user-specified threshold
-             */
-            for (i = 0; i < nlim; i++)
-            {
-                if (dfhist->n_at_lam[i] < expand->minvarmin)
-                {
-                    bSufficientSamples = FALSE;
-                }
-            }
-            if (bSufficientSamples)
-            {
-                dfhist->sum_minvar[fep_state] = clam_minvar;
-                if (fep_state == 0)
-                {
-                    for (i = 0; i < nlim; i++)
-                    {
-                        dfhist->sum_minvar[i] += (expand->minvar_const - clam_minvar);
-                    }
-                    expand->minvar_const          = clam_minvar;
-                    dfhist->sum_minvar[fep_state] = 0.0;
-                }
-                else
-                {
-                    dfhist->sum_minvar[fep_state] -= expand->minvar_const;
-                }
-            }
-        }
-
-        /* we need to rezero minvar now, since it could change at fep_state = 0 */
-        dfhist->sum_dg[0]       = 0.0;
-        dfhist->sum_variance[0] = 0.0;
-        dfhist->sum_weights[0]  = dfhist->sum_dg[0] + dfhist->sum_minvar[0]; /* should be zero */
-
-        for (i = 1; i < nlim; i++)
-        {
-            dfhist->sum_dg[i] = lam_dg[i - 1] + dfhist->sum_dg[i - 1];
-            dfhist->sum_variance[i] =
-                    std::sqrt(lam_variance[i - 1] + gmx::square(dfhist->sum_variance[i - 1]));
-            dfhist->sum_weights[i] = dfhist->sum_dg[i] + dfhist->sum_minvar[i];
-        }
-
-        sfree(lam_dg);
-        sfree(lam_variance);
-
-        sfree(omegam_array);
-        sfree(weightsm_array);
-        sfree(varm_array);
-        sfree(dwm_array);
-
-        sfree(omegap_array);
-        sfree(weightsp_array);
-        sfree(varp_array);
-        sfree(dwp_array);
-    }
-    return FALSE;
-}
-
-/*! \brief Choose the next lambda state with a Monte Carlo move and update the transition matrices.
- *
- * Unless Wang-Landau statistics are in use, and while \c expand->lmc_forced_nstart is
- * positive and the last state has no more than that many samples, the function "marches":
- * it returns either the current state or the next one and leaves the transition matrices
- * untouched.
- *
- * Otherwise \c expand->lmc_repeats moves are attempted. Each move is a Gibbs,
- * Metropolized-Gibbs, Metropolis or Barker move depending on \c expand->elmcmove, and each
- * one accumulates proposal*acceptance probabilities into \c dfhist->Tij. After the last
- * repeat, one count is added to \c dfhist->Tij_empirical for the net transition.
- *
- * \param[in]     nlim            Number of lambda states.
- * \param[in]     expand          Expanded-ensemble settings (move type, repeats, Gibbs range, ...).
- * \param[in,out] dfhist          History; \c n_at_lam and \c sum_weights are read,
- *                                \c Tij and \c Tij_empirical are updated.
- * \param[in]     fep_state       Index of the current lambda state.
- * \param[in]     weighted_lamee  Per-state weighted reduced energies used for the move.
- * \param[in,out] p_k             Per-state probability buffer; handed to
- *                                GenerateGibbsProbabilities() and read back for Gibbs-type moves.
- * \param[in]     seed            Seed of the random engine.
- * \param[in]     step            Current step; the engine is restarted with (step, repeat index).
- * \returns The index of the selected lambda state.
- */
-static int ChooseNewLambda(int               nlim,
-                           const t_expanded* expand,
-                           df_history_t*     dfhist,
-                           int               fep_state,
-                           const real*       weighted_lamee,
-                           double*           p_k,
-                           int64_t           seed,
-                           int64_t           step)
-{
-    /* Choose new lambda value, and update transition matrix */
-
-    int                  i, ifep, minfep, maxfep, lamnew, lamtrial, starting_fep_state;
-    real                 r1, r2, de, trialprob, tprob = 0;
-    double *             propose, *accept, *remainder;
-    double               pks;
-    real                 pnorm;
-    gmx::ThreeFry2x64<0> rng(
-            seed, gmx::RandomDomain::ExpandedEnsemble); // We only draw once, so zero bits internal counter is fine
-    gmx::UniformRealDistribution<real> dist;
-
-    /* Remember where we started: fep_state is overwritten after every repeat below */
-    starting_fep_state = fep_state;
-    lamnew             = fep_state; /* so that there is a default setting -- stays the same */
-
-    if (!EWL(expand->elamstats)) /* ignore equilibrating the weights if using WL */
-    {
-        if ((expand->lmc_forced_nstart > 0) && (dfhist->n_at_lam[nlim - 1] <= expand->lmc_forced_nstart))
-        {
-            /* Use a marching method to run through the lambdas and get preliminary free energy data,
-               before starting 'free' sampling.  We start free sampling when we have enough at each lambda */
-
-            /* if we have enough at this lambda, move on to the next one */
-
-            if (dfhist->n_at_lam[fep_state] == expand->lmc_forced_nstart)
-            {
-                lamnew = fep_state + 1;
-                if (lamnew == nlim) /* whoops, stepped too far! */
-                {
-                    lamnew -= 1;
-                }
-            }
-            else
-            {
-                lamnew = fep_state;
-            }
-            /* Nothing has been allocated yet, so returning here does not leak */
-            return lamnew;
-        }
-    }
-
-    /* Per-state proposal and acceptance probabilities of the current move, plus
-     * 1 - p_k (only filled in for Metropolized Gibbs) */
-    snew(propose, nlim);
-    snew(accept, nlim);
-    snew(remainder, nlim);
-
-    for (i = 0; i < expand->lmc_repeats; i++)
-    {
-        /* Restart the random stream for this (step, repeat) pair */
-        rng.restart(step, i);
-        dist.reset();
-
-        for (ifep = 0; ifep < nlim; ifep++)
-        {
-            propose[ifep] = 0;
-            accept[ifep]  = 0;
-        }
-
-        if ((expand->elmcmove == elmcmoveGIBBS) || (expand->elmcmove == elmcmoveMETGIBBS))
-        {
-            /* use the Gibbs sampler, with restricted range */
-            if (expand->gibbsdeltalam < 0)
-            {
-                minfep = 0;
-                maxfep = nlim - 1;
-            }
-            else
-            {
-                minfep = fep_state - expand->gibbsdeltalam;
-                maxfep = fep_state + expand->gibbsdeltalam;
-                if (minfep < 0)
-                {
-                    minfep = 0;
-                }
-                if (maxfep > nlim - 1)
-                {
-                    maxfep = nlim - 1;
-                }
-            }
-
-            GenerateGibbsProbabilities(weighted_lamee, p_k, &pks, minfep, maxfep);
-
-            if (expand->elmcmove == elmcmoveGIBBS)
-            {
-                for (ifep = minfep; ifep <= maxfep; ifep++)
-                {
-                    propose[ifep] = p_k[ifep];
-                    accept[ifep]  = 1.0;
-                }
-                /* Gibbs sampling */
-                /* Walk the cumulative distribution; if the loop runs off the end,
-                 * lamnew == maxfep + 1, which is handled by the range check below */
-                r1 = dist(rng);
-                for (lamnew = minfep; lamnew <= maxfep; lamnew++)
-                {
-                    if (r1 <= p_k[lamnew])
-                    {
-                        break;
-                    }
-                    r1 -= p_k[lamnew];
-                }
-            }
-            else if (expand->elmcmove == elmcmoveMETGIBBS)
-            {
-
-                /* Metropolized Gibbs sampling */
-                for (ifep = minfep; ifep <= maxfep; ifep++)
-                {
-                    remainder[ifep] = 1 - p_k[ifep];
-                }
-
-                /* find the proposal probabilities */
-
-                if (remainder[fep_state] == 0)
-                {
-                    /* only the current state has any probability */
-                    /* we have to stay at the current state */
-                    lamnew = fep_state;
-                }
-                else
-                {
-                    for (ifep = minfep; ifep <= maxfep; ifep++)
-                    {
-                        if (ifep != fep_state)
-                        {
-                            propose[ifep] = p_k[ifep] / remainder[fep_state];
-                        }
-                        else
-                        {
-                            propose[ifep] = 0;
-                        }
-                    }
-
-                    r1 = dist(rng);
-                    for (lamtrial = minfep; lamtrial <= maxfep; lamtrial++)
-                    {
-                        pnorm = p_k[lamtrial] / remainder[fep_state];
-                        if (lamtrial != fep_state)
-                        {
-                            if (r1 <= pnorm)
-                            {
-                                break;
-                            }
-                            r1 -= pnorm;
-                        }
-                    }
-
-                    /* NOTE(review): if the loop above finishes without a break, lamtrial is
-                     * maxfep + 1 and remainder[lamtrial] below is read outside the range that
-                     * was filled in (and outside the allocation when maxfep == nlim - 1).
-                     * Confirm whether this needs an explicit guard. */
-
-                    /* we have now selected lamtrial according to p(lamtrial)/1-p(fep_state) */
-                    tprob = 1.0;
-                    /* trial probability is min{1,\frac{1 - p(old)}{1-p(new)} MRS 1/8/2008 */
-                    trialprob = (remainder[fep_state]) / (remainder[lamtrial]);
-                    if (trialprob < tprob)
-                    {
-                        tprob = trialprob;
-                    }
-                    r2 = dist(rng);
-                    if (r2 < tprob)
-                    {
-                        lamnew = lamtrial;
-                    }
-                    else
-                    {
-                        lamnew = fep_state;
-                    }
-                }
-
-                /* now figure out the acceptance probability for each */
-                for (ifep = minfep; ifep <= maxfep; ifep++)
-                {
-                    tprob = 1.0;
-                    if (remainder[ifep] != 0)
-                    {
-                        trialprob = (remainder[fep_state]) / (remainder[ifep]);
-                    }
-                    else
-                    {
-                        trialprob = 1.0; /* this state is the only choice! */
-                    }
-                    if (trialprob < tprob)
-                    {
-                        tprob = trialprob;
-                    }
-                    /* probability for fep_state=0, but that's fine, it's never proposed! */
-                    accept[ifep] = tprob;
-                }
-            }
-
-            if (lamnew > maxfep)
-            {
-                /* it's possible some rounding is failing */
-                if (gmx_within_tol(remainder[fep_state], 0, 50 * GMX_DOUBLE_EPS))
-                {
-                    /* numerical rounding error -- no state other than the original has weight */
-                    lamnew = fep_state;
-                }
-                else
-                {
-                    /* probably not a numerical issue */
-                    int   loc    = 0;
-                    int   nerror = 200 + (maxfep - minfep + 1) * 60;
-                    char* errorstr;
-                    snew(errorstr, nerror);
-                    /* if its greater than maxfep, then something went wrong -- probably underflow
-                       in the calculation of sum weights. Generated detailed info for failure */
-                    loc += sprintf(
-                            errorstr,
-                            "Something wrong in choosing new lambda state with a Gibbs move -- "
-                            "probably underflow in weight determination.\nDenominator is: "
-                            "%3d%17.10e\n  i                dE        numerator          weights\n",
-                            0, pks);
-                    for (ifep = minfep; ifep <= maxfep; ifep++)
-                    {
-                        loc += sprintf(&errorstr[loc], "%3d %17.10e%17.10e%17.10e\n", ifep,
-                                       weighted_lamee[ifep], p_k[ifep], dfhist->sum_weights[ifep]);
-                    }
-                    gmx_fatal(FARGS, "%s", errorstr);
-                }
-            }
-        }
-        else if ((expand->elmcmove == elmcmoveMETROPOLIS) || (expand->elmcmove == elmcmoveBARKER))
-        {
-            /* use the metropolis sampler with trial +/- 1 */
-            r1 = dist(rng);
-            if (r1 < 0.5)
-            {
-                if (fep_state == 0)
-                {
-                    lamtrial = fep_state;
-                }
-                else
-                {
-                    lamtrial = fep_state - 1;
-                }
-            }
-            else
-            {
-                if (fep_state == nlim - 1)
-                {
-                    lamtrial = fep_state;
-                }
-                else
-                {
-                    lamtrial = fep_state + 1;
-                }
-            }
-
-            de = weighted_lamee[lamtrial] - weighted_lamee[fep_state];
-            if (expand->elmcmove == elmcmoveMETROPOLIS)
-            {
-                tprob = 1.0;
-                if (de < 0)
-                {
-                    tprob = std::exp(de);
-                }
-                propose[fep_state] = 0;
-                propose[lamtrial]  = 1.0; /* note that this overwrites the above line if fep_state = ntrial, which only occurs at the ends */
-                accept[fep_state] =
-                        1.0; /* doesn't actually matter, never proposed unless fep_state = ntrial, in which case it's 1.0 anyway */
-                accept[lamtrial] = tprob;
-            }
-            else if (expand->elmcmove == elmcmoveBARKER)
-            {
-                if (de > 0) /* Numerically stable version */
-                {
-                    tprob = 1.0 / (1.0 + std::exp(-de));
-                }
-                else
-                {
-                    /* Covers de == 0 as well (giving exactly 0.5); otherwise tprob would
-                     * keep its initial zero or the value left over from the previous repeat */
-                    tprob = std::exp(de) / (std::exp(de) + 1.0);
-                }
-                propose[fep_state] = (1 - tprob);
-                propose[lamtrial] +=
-                        tprob; /* we add, to account for the fact that at the end, they might be the same point */
-                accept[fep_state] = 1.0;
-                accept[lamtrial]  = 1.0;
-            }
-
-            r2 = dist(rng);
-            if (r2 < tprob)
-            {
-                lamnew = lamtrial;
-            }
-            else
-            {
-                lamnew = fep_state;
-            }
-        }
-
-        /* Accumulate the analytical transition probabilities out of fep_state:
-         * accepted proposals go to the target state, rejected ones stay put */
-        for (ifep = 0; ifep < nlim; ifep++)
-        {
-            dfhist->Tij[fep_state][ifep] += propose[ifep] * accept[ifep];
-            dfhist->Tij[fep_state][fep_state] += propose[ifep] * (1.0 - accept[ifep]);
-        }
-        /* The next repeat starts from the state just selected */
-        fep_state = lamnew;
-    }
-
-    /* One empirical count for the net transition over all repeats */
-    dfhist->Tij_empirical[starting_fep_state][lamnew] += 1.0;
-
-    sfree(propose);
-    sfree(accept);
-    sfree(remainder);
-
-    return lamnew;
-}
-
-/*! \brief Print the expanded-ensemble weights to a file, marking the current state.
- *
- * Nothing is written unless \p frequency is nonzero and \p step is a multiple of it.
- * One row is written per lambda state: the lambda components that have
- * \c separate_dvdl set (and the temperature when \p simtemp is given), the sample
- * count or Wang-Landau histogram, and the weights and their differences. The row of
- * the current state is flagged with "<<".
- *
- * The analytical and empirical transition matrices are appended when
- * \c expand->nstTij is positive, \p step is positive and \p step is a multiple of
- * \c expand->nstTij.
- *
- * \param[in] outfile    Destination stream.
- * \param[in] fep        Lambda settings (\c n_lambda, \c separate_dvdl, \c all_lambda).
- * \param[in] expand     Expanded-ensemble settings (\c elamstats, \c nstTij, \c bSymmetrizedTMatrix).
- * \param[in] simtemp    Simulated-tempering data, or nullptr when temperatures are not printed.
- * \param[in] dfhist     History holding counts, weights, variances and transition matrices.
- * \param[in] fep_state  Index of the current lambda state.
- * \param[in] frequency  Output interval in steps.
- * \param[in] step       Current step.
- */
-void PrintFreeEnergyInfoToFile(FILE*               outfile,
-                               const t_lambda*     fep,
-                               const t_expanded*   expand,
-                               const t_simtemp*    simtemp,
-                               const df_history_t* dfhist,
-                               int                 fep_state,
-                               int                 frequency,
-                               int64_t             step)
-{
-    int         nlim, i, ifep, jfep;
-    real        dw, dg, dv, Tprint;
-    const char* print_names[efptNR] = { " FEPL", "MassL", "CoulL",   " VdwL",
-                                        "BondL", "RestT", "Temp.(K)" };
-    gmx_bool    bSimTemp            = FALSE;
-
-    nlim = fep->n_lambda;
-    if (simtemp != nullptr)
-    {
-        bSimTemp = TRUE;
-    }
-
-    /* Test frequency before taking the remainder: a zero interval would otherwise
-     * divide by zero */
-    if ((frequency != 0) && (step % frequency == 0))
-    {
-        fprintf(outfile, "             MC-lambda information\n");
-        if (EWL(expand->elamstats) && (!(dfhist->bEquil)))
-        {
-            fprintf(outfile, "  Wang-Landau incrementor is: %11.5g\n", dfhist->wl_delta);
-        }
-        /* Column headers */
-        fprintf(outfile, "  N");
-        for (i = 0; i < efptNR; i++)
-        {
-            if (fep->separate_dvdl[i])
-            {
-                fprintf(outfile, "%7s", print_names[i]);
-            }
-            else if ((i == efptTEMPERATURE) && bSimTemp)
-            {
-                fprintf(outfile, "%10s", print_names[i]); /* more space for temperature formats */
-            }
-        }
-        fprintf(outfile, "    Count   ");
-        if (expand->elamstats == elamstatsMINVAR)
-        {
-            fprintf(outfile, "W(in kT)   G(in kT)  dG(in kT)  dV(in kT)\n");
-        }
-        else
-        {
-            fprintf(outfile, "G(in kT)  dG(in kT)\n");
-        }
-        /* One row per lambda state */
-        for (ifep = 0; ifep < nlim; ifep++)
-        {
-            /* Differences are taken to the next state; the last state has none */
-            if (ifep == nlim - 1)
-            {
-                dw = 0.0;
-                dg = 0.0;
-                dv = 0.0;
-            }
-            else
-            {
-                dw = dfhist->sum_weights[ifep + 1] - dfhist->sum_weights[ifep];
-                dg = dfhist->sum_dg[ifep + 1] - dfhist->sum_dg[ifep];
-                dv = std::sqrt(gmx::square(dfhist->sum_variance[ifep + 1])
-                               - gmx::square(dfhist->sum_variance[ifep]));
-            }
-            fprintf(outfile, "%3d", (ifep + 1));
-            for (i = 0; i < efptNR; i++)
-            {
-                if (fep->separate_dvdl[i])
-                {
-                    fprintf(outfile, "%7.3f", fep->all_lambda[i][ifep]);
-                }
-                else if (i == efptTEMPERATURE && bSimTemp)
-                {
-                    fprintf(outfile, "%9.3f", simtemp->temperatures[ifep]);
-                }
-            }
-            if (EWL(expand->elamstats)
-                && (!(dfhist->bEquil))) /* if performing WL and still haven't equilibrated */
-            {
-                if (expand->elamstats == elamstatsWL)
-                {
-                    fprintf(outfile, " %8d", static_cast<int>(dfhist->wl_histo[ifep]));
-                }
-                else
-                {
-                    fprintf(outfile, " %8.3f", dfhist->wl_histo[ifep]);
-                }
-            }
-            else /* we have equilibrated weights */
-            {
-                fprintf(outfile, " %8d", dfhist->n_at_lam[ifep]);
-            }
-            if (expand->elamstats == elamstatsMINVAR)
-            {
-                fprintf(outfile, " %10.5f %10.5f %10.5f %10.5f", dfhist->sum_weights[ifep],
-                        dfhist->sum_dg[ifep], dg, dv);
-            }
-            else
-            {
-                fprintf(outfile, " %10.5f %10.5f", dfhist->sum_weights[ifep], dw);
-            }
-            /* Flag the row of the current state */
-            if (ifep == fep_state)
-            {
-                fprintf(outfile, " <<\n");
-            }
-            else
-            {
-                fprintf(outfile, "   \n");
-            }
-        }
-        fprintf(outfile, "\n");
-
-        /* Check nstTij > 0 before using it as a divisor: a zero interval would
-         * otherwise divide by zero */
-        if ((expand->nstTij > 0) && (step > 0) && (step % expand->nstTij == 0))
-        {
-            fprintf(outfile, "                     Transition Matrix\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                fprintf(outfile, "%12d", (ifep + 1));
-            }
-            fprintf(outfile, "\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                for (jfep = 0; jfep < nlim; jfep++)
-                {
-                    /* Rows of states that were never visited are printed as zeros */
-                    if (dfhist->n_at_lam[ifep] > 0)
-                    {
-                        if (expand->bSymmetrizedTMatrix)
-                        {
-                            Tprint = (dfhist->Tij[ifep][jfep] + dfhist->Tij[jfep][ifep])
-                                     / (dfhist->n_at_lam[ifep] + dfhist->n_at_lam[jfep]);
-                        }
-                        else
-                        {
-                            Tprint = (dfhist->Tij[ifep][jfep]) / (dfhist->n_at_lam[ifep]);
-                        }
-                    }
-                    else
-                    {
-                        Tprint = 0.0;
-                    }
-                    fprintf(outfile, "%12.8f", Tprint);
-                }
-                fprintf(outfile, "%3d\n", (ifep + 1));
-            }
-
-            fprintf(outfile, "                  Empirical Transition Matrix\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                fprintf(outfile, "%12d", (ifep + 1));
-            }
-            fprintf(outfile, "\n");
-            for (ifep = 0; ifep < nlim; ifep++)
-            {
-                for (jfep = 0; jfep < nlim; jfep++)
-                {
-                    /* Rows of states that were never visited are printed as zeros */
-                    if (dfhist->n_at_lam[ifep] > 0)
-                    {
-                        if (expand->bSymmetrizedTMatrix)
-                        {
-                            Tprint = (dfhist->Tij_empirical[ifep][jfep] + dfhist->Tij_empirical[jfep][ifep])
-                                     / (dfhist->n_at_lam[ifep] + dfhist->n_at_lam[jfep]);
-                        }
-                        else
-                        {
-                            Tprint = dfhist->Tij_empirical[ifep][jfep] / (dfhist->n_at_lam[ifep]);
-                        }
-                    }
-                    else
-                    {
-                        Tprint = 0.0;
-                    }
-                    fprintf(outfile, "%12.8f", Tprint);
-                }
-                fprintf(outfile, "%3d\n", (ifep + 1));
-            }
-        }
-    }
-}
-
-int ExpandedEnsembleDynamics(FILE*                 log,
-                             const t_inputrec*     ir,
-                             const gmx_enerdata_t* enerd,
-                             t_state*              state,
-                             t_extmass*            MassQ,
-                             int                   fep_state,
-                             df_history_t*         dfhist,
-                             int64_t               step,
-                             rvec*                 v,
-                             const t_mdatoms*      mdatoms)
-/* Note that the state variable is only needed for simulated tempering, not
-   Hamiltonian expanded ensemble.  May be able to remove it after integrator refactoring. */
-{
-    real *      pfep_lamee, *scaled_lamee, *weighted_lamee;
-    double*     p_k;
-    int         i, nlim, lamnew, totalsamples;
-    real        oneovert, maxscaled = 0, maxweighted = 0;
-    t_expanded* expand;
-    t_simtemp*  simtemp;
-    gmx_bool    bIfReset, bSwitchtoOneOverT, bDoneEquilibrating = FALSE;
-
-    expand  = ir->expandedvals;
-    simtemp = ir->simtempvals;
-    nlim    = ir->fepvals->n_lambda;
-
-    snew(scaled_lamee, nlim);
-    snew(weighted_lamee, nlim);
-    snew(pfep_lamee, nlim);
-    snew(p_k, nlim);
-
-    /* update the count at the current lambda*/
-    dfhist->n_at_lam[fep_state]++;
-
-    /* need to calculate the PV term somewhere, but not needed here? Not until there's a lambda
-       state that's pressure controlled.*/
-    /*
-       pVTerm = 0;
-       where does this PV term go?
-       for (i=0;i<nlim;i++)
-       {
-       fep_lamee[i] += pVTerm;
-       }
-     */
-
-    /* determine the minimum value to avoid overflow.  Probably a better way to do this */
-    /* we don't need to include the pressure term, since the volume is the same between the two.
-       is there some term we are neglecting, however? */
-
-    if (ir->efep != efepNO)
-    {
-        for (i = 0; i < nlim; i++)
-        {
-            if (ir->bSimTemp)
-            {
-                /* Note -- this assumes no mass changes, since kinetic energy is not added  . . . */
-                scaled_lamee[i] = enerd->foreignLambdaTerms.deltaH(i) / (simtemp->temperatures[i] * BOLTZ)
-                                  + enerd->term[F_EPOT]
-                                            * (1.0 / (simtemp->temperatures[i])
-                                               - 1.0 / (simtemp->temperatures[fep_state]))
-                                            / BOLTZ;
-            }
-            else
-            {
-                scaled_lamee[i] = enerd->foreignLambdaTerms.deltaH(i) / (expand->mc_temp * BOLTZ);
-                /* mc_temp is currently set to the system reft unless otherwise defined */
-            }
-
-            /* save these energies for printing, so they don't get overwritten by the next step */
-            /* they aren't overwritten in the non-free energy case, but we always print with these
-               for simplicity */
-        }
-    }
-    else
-    {
-        if (ir->bSimTemp)
-        {
-            for (i = 0; i < nlim; i++)
-            {
-                scaled_lamee[i] =
-                        enerd->term[F_EPOT]
-                        * (1.0 / simtemp->temperatures[i] - 1.0 / simtemp->temperatures[fep_state]) / BOLTZ;
-            }
-        }
-    }
-
-    for (i = 0; i < nlim; i++)
-    {
-        pfep_lamee[i] = scaled_lamee[i];
-
-        weighted_lamee[i] = dfhist->sum_weights[i] - scaled_lamee[i];
-        if (i == 0)
-        {
-            maxscaled   = scaled_lamee[i];
-            maxweighted = weighted_lamee[i];
-        }
-        else
-        {
-            if (scaled_lamee[i] > maxscaled)
-            {
-                maxscaled = scaled_lamee[i];
-            }
-            if (weighted_lamee[i] > maxweighted)
-            {
-                maxweighted = weighted_lamee[i];
-            }
-        }
-    }
-
-    for (i = 0; i < nlim; i++)
-    {
-        scaled_lamee[i] -= maxscaled;
-        weighted_lamee[i] -= maxweighted;
-    }
-
-    /* update weights - we decide whether or not to actually do this inside */
-
-    bDoneEquilibrating =
-            UpdateWeights(nlim, expand, dfhist, fep_state, scaled_lamee, weighted_lamee, step);
-    if (bDoneEquilibrating)
-    {
-        if (log)
-        {
-            fprintf(log, "\nStep %" PRId64 ": Weights have equilibrated, using criteria: %s\n",
-                    step, elmceq_names[expand->elmceq]);
-        }
-    }
-
-    lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k,
-                             ir->expandedvals->lmc_seed, step);
-    /* if using simulated tempering, we need to adjust the temperatures */
-    if (ir->bSimTemp && (lamnew != fep_state)) /* only need to change the temperatures if we change the state */
-    {
-        int   i, j, n, d;
-        real* buf_ngtc;
-        real  told;
-        int   nstart, nend, gt;
-
-        snew(buf_ngtc, ir->opts.ngtc);
-
-        for (i = 0; i < ir->opts.ngtc; i++)
-        {
-            if (ir->opts.ref_t[i] > 0)
-            {
-                told              = ir->opts.ref_t[i];
-                ir->opts.ref_t[i] = simtemp->temperatures[lamnew];
-                buf_ngtc[i]       = std::sqrt(ir->opts.ref_t[i] / told); /* using the buffer as temperature scaling */
-            }
-        }
-
-        /* we don't need to manipulate the ekind information, as it isn't due to be reset until the next step anyway */
-
-        nstart = 0;
-        nend   = mdatoms->homenr;
-        for (n = nstart; n < nend; n++)
-        {
-            gt = 0;
-            if (mdatoms->cTC)
-            {
-                gt = mdatoms->cTC[n];
-            }
-            for (d = 0; d < DIM; d++)
-            {
-                v[n][d] *= buf_ngtc[gt];
-            }
-        }
-
-        if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir))
-        {
-            /* we need to recalculate the masses if the temperature has changed */
-            init_npt_masses(ir, state, MassQ, FALSE);
-            for (i = 0; i < state->nnhpres; i++)
-            {
-                for (j = 0; j < ir->opts.nhchainlength; j++)
-                {
-                    state->nhpres_vxi[i + j] *= buf_ngtc[i];
-                }
-            }
-            for (i = 0; i < ir->opts.ngtc; i++)
-            {
-                for (j = 0; j < ir->opts.nhchainlength; j++)
-                {
-                    state->nosehoover_vxi[i + j] *= buf_ngtc[i];
-                }
-            }
-        }
-        sfree(buf_ngtc);
-    }
-
-    /* now check on the Wang-Landau updating critera */
-
-    if (EWL(expand->elamstats))
-    {
-        bSwitchtoOneOverT = FALSE;
-        if (expand->bWLoneovert)
-        {
-            totalsamples = 0;
-            for (i = 0; i < nlim; i++)
-            {
-                totalsamples += dfhist->n_at_lam[i];
-            }
-            oneovert = (1.0 * nlim) / totalsamples;
-            /* oneovert has decreasd by a bit since last time, so we actually make sure its within one of this number */
-            /* switch to 1/t incrementing when wl_delta has decreased at least once, and wl_delta is now less than 1/t */
-            if ((dfhist->wl_delta <= ((totalsamples) / (totalsamples - 1.00001)) * oneovert)
-                && (dfhist->wl_delta < expand->init_wl_delta))
-            {
-                bSwitchtoOneOverT = TRUE;
-            }
-        }
-        if (bSwitchtoOneOverT)
-        {
-            dfhist->wl_delta =
-                    oneovert; /* now we reduce by this each time, instead of only at flatness */
-        }
-        else
-        {
-            bIfReset = CheckHistogramRatios(nlim, dfhist->wl_histo, expand->wl_ratio);
-            if (bIfReset)
-            {
-                for (i = 0; i < nlim; i++)
-                {
-                    dfhist->wl_histo[i] = 0;
-                }
-                dfhist->wl_delta *= expand->wl_scale;
-                if (log)
-                {
-                    fprintf(log, "\nStep %d: weights are now:", static_cast<int>(step));
-                    for (i = 0; i < nlim; i++)
-                    {
-                        fprintf(log, " %.5f", dfhist->sum_weights[i]);
-                    }
-                    fprintf(log, "\n");
-                }
-            }
-        }
-    }
-    sfree(pfep_lamee);
-    sfree(scaled_lamee);
-    sfree(weighted_lamee);
-    sfree(p_k);
-
-    return lamnew;
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.h b/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.h
deleted file mode 100644
index 7766a864fd..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_MDLIB_EXPANDED_H
-#define GMX_MDLIB_EXPANDED_H
-
-#include <stdio.h>
-
-#include "gromacs/math/vectypes.h"
-#include "gromacs/utility/basedefinitions.h"
-
-struct df_history_t;
-struct gmx_enerdata_t;
-struct t_expanded;
-struct t_extmass;
-struct t_inputrec;
-struct t_lambda;
-struct t_mdatoms;
-struct t_simtemp;
-class t_state;
-
-namespace gmx
-{
-class MDLogger;
-} // namespace gmx
-
-void init_npt_masses(const t_inputrec* ir, t_state* state, t_extmass* MassQ, gmx_bool bInit);
-
-void init_expanded_ensemble(gmx_bool bStateFromCP, const t_inputrec* ir, df_history_t* dfhist, const gmx::MDLogger& mdlog);
-
-int ExpandedEnsembleDynamics(FILE*                 log,
-                             const t_inputrec*     ir,
-                             const gmx_enerdata_t* enerd,
-                             t_state*              state,
-                             t_extmass*            MassQ,
-                             int                   fep_state,
-                             df_history_t*         dfhist,
-                             int64_t               step,
-                             rvec*                 v,
-                             const t_mdatoms*      mdatoms,
-                             real*                 realFepState);
-
-void PrintFreeEnergyInfoToFile(FILE*               outfile,
-                               const t_lambda*     fep,
-                               const t_expanded*   expand,
-                               const t_simtemp*    simtemp,
-                               const df_history_t* dfhist,
-                               int                 fep_state,
-                               int                 frequency,
-                               int64_t             step);
-
-#endif
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.h.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.h.preplumed
deleted file mode 100644
index 6f6bec9804..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/expanded.h.preplumed
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_MDLIB_EXPANDED_H
-#define GMX_MDLIB_EXPANDED_H
-
-#include <stdio.h>
-
-#include "gromacs/math/vectypes.h"
-#include "gromacs/utility/basedefinitions.h"
-
-struct df_history_t;
-struct gmx_enerdata_t;
-struct t_expanded;
-struct t_extmass;
-struct t_inputrec;
-struct t_lambda;
-struct t_mdatoms;
-struct t_simtemp;
-class t_state;
-
-void init_npt_masses(const t_inputrec* ir, t_state* state, t_extmass* MassQ, gmx_bool bInit);
-
-void init_expanded_ensemble(gmx_bool bStateFromCP, const t_inputrec* ir, df_history_t* dfhist);
-
-int ExpandedEnsembleDynamics(FILE*                 log,
-                             const t_inputrec*     ir,
-                             const gmx_enerdata_t* enerd,
-                             t_state*              state,
-                             t_extmass*            MassQ,
-                             int                   fep_state,
-                             df_history_t*         dfhist,
-                             int64_t               step,
-                             rvec*                 v,
-                             const t_mdatoms*      mdatoms);
-
-void PrintFreeEnergyInfoToFile(FILE*               outfile,
-                               const t_lambda*     fep,
-                               const t_expanded*   expand,
-                               const t_simtemp*    simtemp,
-                               const df_history_t* dfhist,
-                               int                 fep_state,
-                               int                 frequency,
-                               int64_t             step);
-
-#endif
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/sim_util.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdlib/sim_util.cpp
deleted file mode 100644
index 69625102ea..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/sim_util.cpp
+++ /dev/null
@@ -1,2178 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include "config.h"
-
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-
-#include <array>
-#include <optional>
-
-#include "gromacs/applied_forces/awh/awh.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/gpuhaloexchange.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/ewald/pme.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
-#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
-#include "gromacs/gmxlib/nonbonded/nonbonded.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/listed_forces/orires.h"
-#include "gromacs/math/arrayrefwithpadding.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/math/vecdump.h"
-#include "gromacs/mdlib/calcmu.h"
-#include "gromacs/mdlib/calcvir.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/dispersioncorrection.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdlib/wall.h"
-#include "gromacs/mdlib/wholemoleculetransform.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forceoutput.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/iforceprovider.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/multipletimestepping.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/nbnxm/nbnxm_gpu.h"
-#include "gromacs/pbcutil/ishift.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/timing/cyclecounter.h"
-#include "gromacs/timing/gpu_timing.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/wallcyclereporting.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/exceptions.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/fixedcapacityvector.h"
-#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/smalloc.h"
-#include "gromacs/utility/strconvert.h"
-#include "gromacs/utility/sysinfo.h"
-
-#include "gpuforcereduction.h"
-
-using gmx::ArrayRef;
-using gmx::AtomLocality;
-using gmx::DomainLifetimeWorkload;
-using gmx::ForceOutputs;
-using gmx::ForceWithShiftForces;
-using gmx::InteractionLocality;
-using gmx::RVec;
-using gmx::SimulationWorkload;
-using gmx::StepWorkload;
-
-/* PLUMED */
-#include "../../../Plumed.h"
-extern int    plumedswitch;
-extern plumed plumedmain;
-/* END PLUMED */
-
-// TODO: this environment variable allows us to verify before release
-// that on less common architectures the total cost of polling is not larger than
-// a blocking wait (so polling does not introduce overhead when the static
-// PME-first ordering would suffice).
-static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
-
-static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
-{
-    GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
-    const int end = forceToAdd.size();
-
-    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
-#pragma omp parallel for num_threads(nt) schedule(static)
-    for (int i = 0; i < end; i++)
-    {
-        rvec_inc(f[i], forceToAdd[i]);
-    }
-}
-
-static void calc_virial(int                              start,
-                        int                              homenr,
-                        const rvec                       x[],
-                        const gmx::ForceWithShiftForces& forceWithShiftForces,
-                        tensor                           vir_part,
-                        const matrix                     box,
-                        t_nrnb*                          nrnb,
-                        const t_forcerec*                fr,
-                        PbcType                          pbcType)
-{
-    /* The short-range virial from surrounding boxes */
-    const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
-    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, pbcType == PbcType::Screw, box);
-    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
-
-    /* Calculate partial virial, for local atoms only, based on short range.
-     * Total virial is computed in global_stat, called from do_md
-     */
-    const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
-    f_calc_vir(start, start + homenr, x, f, vir_part, box);
-    inc_nrnb(nrnb, eNR_VIRIAL, homenr);
-
-    if (debug)
-    {
-        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
-    }
-}
-
-static void pull_potential_wrapper(const t_commrec*               cr,
-                                   const t_inputrec*              ir,
-                                   const matrix                   box,
-                                   gmx::ArrayRef<const gmx::RVec> x,
-                                   gmx::ForceWithVirial*          force,
-                                   const t_mdatoms*               mdatoms,
-                                   gmx_enerdata_t*                enerd,
-                                   pull_t*                        pull_work,
-                                   const real*                    lambda,
-                                   double                         t,
-                                   gmx_wallcycle_t                wcycle)
-{
-    t_pbc pbc;
-    real  dvdl;
-
-    /* Calculate the center of mass forces, this requires communication,
-     * which is why pull_potential is called close to other communication.
-     */
-    wallcycle_start(wcycle, ewcPULLPOT);
-    set_pbc(&pbc, ir->pbcType, box);
-    dvdl = 0;
-    enerd->term[F_COM_PULL] +=
-            pull_potential(pull_work, mdatoms->massT, &pbc, cr, t, lambda[efptRESTRAINT],
-                           as_rvec_array(x.data()), force, &dvdl);
-    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
-    wallcycle_stop(wcycle, ewcPULLPOT);
-}
-
-static void pme_receive_force_ener(t_forcerec*           fr,
-                                   const t_commrec*      cr,
-                                   gmx::ForceWithVirial* forceWithVirial,
-                                   gmx_enerdata_t*       enerd,
-                                   bool                  useGpuPmePpComms,
-                                   bool                  receivePmeForceToGpu,
-                                   gmx_wallcycle_t       wcycle)
-{
-    real  e_q, e_lj, dvdl_q, dvdl_lj;
-    float cycles_ppdpme, cycles_seppme;
-
-    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
-    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
-
-    /* In case of node-splitting, the PP nodes receive the long-range
-     * forces, virial and energy from the PME nodes here.
-     */
-    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
-    dvdl_q  = 0;
-    dvdl_lj = 0;
-    gmx_pme_receive_f(fr->pmePpCommGpu.get(), cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
-                      useGpuPmePpComms, receivePmeForceToGpu, &cycles_seppme);
-    enerd->term[F_COUL_RECIP] += e_q;
-    enerd->term[F_LJ_RECIP] += e_lj;
-    enerd->dvdl_lin[efptCOUL] += dvdl_q;
-    enerd->dvdl_lin[efptVDW] += dvdl_lj;
-
-    if (wcycle)
-    {
-        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
-    }
-    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
-}
-
-static void print_large_forces(FILE*                fp,
-                               const t_mdatoms*     md,
-                               const t_commrec*     cr,
-                               int64_t              step,
-                               real                 forceTolerance,
-                               ArrayRef<const RVec> x,
-                               ArrayRef<const RVec> f)
-{
-    real       force2Tolerance = gmx::square(forceTolerance);
-    gmx::index numNonFinite    = 0;
-    for (int i = 0; i < md->homenr; i++)
-    {
-        real force2    = norm2(f[i]);
-        bool nonFinite = !std::isfinite(force2);
-        if (force2 >= force2Tolerance || nonFinite)
-        {
-            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n", step,
-                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
-        }
-        if (nonFinite)
-        {
-            numNonFinite++;
-        }
-    }
-    if (numNonFinite > 0)
-    {
-        /* Note that with MPI this fatal call on one rank might interrupt
-         * the printing on other ranks. But we can only avoid that with
-         * an expensive MPI barrier that we would need at each step.
-         */
-        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
-    }
-}
-
-//! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
-static void postProcessForceWithShiftForces(t_nrnb*                   nrnb,
-                                            gmx_wallcycle_t           wcycle,
-                                            const matrix              box,
-                                            ArrayRef<const RVec>      x,
-                                            ForceOutputs*             forceOutputs,
-                                            tensor                    vir_force,
-                                            const t_mdatoms&          mdatoms,
-                                            const t_forcerec&         fr,
-                                            gmx::VirtualSitesHandler* vsite,
-                                            const StepWorkload&       stepWork)
-{
-    ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
-
-    /* If we have NoVirSum forces, but we do not calculate the virial,
-     * we later sum the forceWithShiftForces buffer together with
-     * the noVirSum buffer and spread the combined vsite forces at once.
-     */
-    if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
-    {
-        using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;
-
-        auto                 f      = forceWithShiftForces.force();
-        auto                 fshift = forceWithShiftForces.shiftForces();
-        const VirialHandling virialHandling =
-                (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
-        vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
-        forceWithShiftForces.haveSpreadVsiteForces() = true;
-    }
-
-    if (stepWork.computeVirial)
-    {
-        /* Calculation of the virial must be done after vsites! */
-        calc_virial(0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force,
-                    box, nrnb, &fr, fr.pbcType);
-    }
-}
-
-//! Spread, compute virial for and sum forces, when necessary
-static void postProcessForces(const t_commrec*          cr,
-                              int64_t                   step,
-                              t_nrnb*                   nrnb,
-                              gmx_wallcycle_t           wcycle,
-                              const matrix              box,
-                              ArrayRef<const RVec>      x,
-                              ForceOutputs*             forceOutputs,
-                              tensor                    vir_force,
-                              const t_mdatoms*          mdatoms,
-                              const t_forcerec*         fr,
-                              gmx::VirtualSitesHandler* vsite,
-                              const StepWorkload&       stepWork)
-{
-    // Extract the final output force buffer, which is also the buffer for forces with shift forces
-    ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();
-
-    if (forceOutputs->haveForceWithVirial())
-    {
-        auto& forceWithVirial = forceOutputs->forceWithVirial();
-
-        if (vsite)
-        {
-            /* Spread the mesh force on virtual sites to the other particles...
-             * This is parallellized. MPI communication is performed
-             * if the constructing atoms aren't local.
-             */
-            GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
-                       "We need separate force buffers for shift and virial forces when "
-                       "computing the virial");
-            GMX_ASSERT(!stepWork.computeVirial
-                               || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
-                       "We should spread the force with shift forces separately when computing "
-                       "the virial");
-            const gmx::VirtualSitesHandler::VirialHandling virialHandling =
-                    (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
-                                            : gmx::VirtualSitesHandler::VirialHandling::None);
-            matrix virial = { { 0 } };
-            vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
-            forceWithVirial.addVirialContribution(virial);
-        }
-
-        if (stepWork.computeVirial)
-        {
-            /* Now add the forces, this is local */
-            sum_forces(f, forceWithVirial.force_);
-
-            /* Add the direct virial contributions */
-            GMX_ASSERT(
-                    forceWithVirial.computeVirial_,
-                    "forceWithVirial should request virial computation when we request the virial");
-            m_add(vir_force, forceWithVirial.getVirial(), vir_force);
-
-            if (debug)
-            {
-                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
-            }
-        }
-    }
-    else
-    {
-        GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
-                   "We should have spread the vsite forces (earlier)");
-    }
-
-    if (fr->print_force >= 0)
-    {
-        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
-    }
-}
-
-static void do_nb_verlet(t_forcerec*                fr,
-                         const interaction_const_t* ic,
-                         gmx_enerdata_t*            enerd,
-                         const StepWorkload&        stepWork,
-                         const InteractionLocality  ilocality,
-                         const int                  clearF,
-                         const int64_t              step,
-                         t_nrnb*                    nrnb,
-                         gmx_wallcycle_t            wcycle)
-{
-    if (!stepWork.computeNonbondedForces)
-    {
-        /* skip non-bonded calculation */
-        return;
-    }
-
-    nonbonded_verlet_t* nbv = fr->nbv.get();
-
-    /* GPU kernel launch overhead is already timed separately */
-    if (!nbv->useGpu())
-    {
-        /* When dynamic pair-list  pruning is requested, we need to prune
-         * at nstlistPrune steps.
-         */
-        if (nbv->isDynamicPruningStepCpu(step))
-        {
-            /* Prune the pair-list beyond fr->ic->rlistPrune using
-             * the current coordinates of the atoms.
-             */
-            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
-            nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
-            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
-        }
-    }
-
-    nbv->dispatchNonbondedKernel(ilocality, *ic, stepWork, clearF, *fr, enerd, nrnb);
-}
-
-static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
-{
-    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, v.ssize());
-
-    /* Note that we would like to avoid this conditional by putting it
-     * into the omp pragma instead, but then we still take the full
-     * omp parallel for overhead (at least with gcc5).
-     */
-    if (!useOpenmpThreading || nth == 1)
-    {
-        for (RVec& elem : v)
-        {
-            clear_rvec(elem);
-        }
-    }
-    else
-    {
-#pragma omp parallel for num_threads(nth) schedule(static)
-        for (gmx::index i = 0; i < v.ssize(); i++)
-        {
-            clear_rvec(v[i]);
-        }
-    }
-}
-
-/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
- *
- * \param groupOptions  Group options, containing T-coupling options
- */
-static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
-{
-    real nrdfCoupled   = 0;
-    real nrdfUncoupled = 0;
-    real kineticEnergy = 0;
-    for (int g = 0; g < groupOptions.ngtc; g++)
-    {
-        if (groupOptions.tau_t[g] >= 0)
-        {
-            nrdfCoupled += groupOptions.nrdf[g];
-            kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * BOLTZ;
-        }
-        else
-        {
-            nrdfUncoupled += groupOptions.nrdf[g];
-        }
-    }
-
-    /* This conditional with > also catches nrdf=0 */
-    if (nrdfCoupled > nrdfUncoupled)
-    {
-        return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-/*! \brief This routine checks that the potential energy is finite.
- *
- * Always checks that the potential energy is finite. If step equals
- * inputrec.init_step also checks that the magnitude of the potential energy
- * is reasonable. Terminates with a fatal error when a check fails.
- * Note that passing this check does not guarantee finite forces,
- * since those use slightly different arithmetics. But in most cases
- * there is just a narrow coordinate range where forces are not finite
- * and energies are finite.
- *
- * \param[in] step      The step number, used for checking and printing
- * \param[in] enerd     The energy data; the non-bonded group energies need to be added to
- * enerd.term[F_EPOT] before calling this routine \param[in] inputrec  The input record
- */
-static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
-{
-    /* Threshold valid for comparing absolute potential energy against
-     * the kinetic energy. Normally one should not consider absolute
-     * potential energy values, but with a factor of one million
-     * we should never get false positives.
-     */
-    constexpr real c_thresholdFactor = 1e6;
-
-    bool energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
-    real averageKineticEnergy = 0;
-    /* We only check for large potential energy at the initial step,
-     * because that is by far the most likely step for this too occur
-     * and because computing the average kinetic energy is not free.
-     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
-     * before they become NaN.
-     */
-    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
-    {
-        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
-    }
-
-    if (energyIsNotFinite
-        || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
-    {
-        gmx_fatal(
-                FARGS,
-                "Step %" PRId64
-                ": The total potential energy is %g, which is %s. The LJ and electrostatic "
-                "contributions to the energy are %g and %g, respectively. A %s potential energy "
-                "can be caused by overlapping interactions in bonded interactions or very large%s "
-                "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
-                "configuration, incorrect interactions or parameters in the topology.",
-                step, enerd.term[F_EPOT], energyIsNotFinite ? "not finite" : "extremely high",
-                enerd.term[F_LJ], enerd.term[F_COUL_SR],
-                energyIsNotFinite ? "non-finite" : "very high", energyIsNotFinite ? " or Nan" : "");
-    }
-}
-
-/*! \brief Return true if there are special forces computed this step.
- *
- * The conditionals exactly correspond to those in computeSpecialForces().
- */
-static bool haveSpecialForces(const t_inputrec&          inputrec,
-                              const gmx::ForceProviders& forceProviders,
-                              const pull_t*              pull_work,
-                              const bool                 computeForces,
-                              const gmx_edsam*           ed)
-{
-
-    return ((computeForces && forceProviders.hasForceProvider()) || // forceProviders
-            (inputrec.bPull && pull_have_potential(*pull_work)) ||  // pull
-            inputrec.bRot ||                                        // enforced rotation
-            (ed != nullptr) ||                                      // flooding
-            (inputrec.bIMD && computeForces));                      // IMD
-}
-
-/*! \brief Compute forces and/or energies for special algorithms
- *
- * The intention is to collect all calls to algorithms that compute
- * forces on local atoms only and that do not contribute to the local
- * virial sum (but add their virial contribution separately).
- * Eventually these should likely all become ForceProviders.
- * Within this function the intention is to have algorithms that do
- * global communication at the end, so global barriers within the MD loop
- * are as close together as possible.
- *
- * \param[in]     fplog            The log file
- * \param[in]     cr               The communication record
- * \param[in]     inputrec         The input record
- * \param[in]     awh              The Awh module (nullptr if none in use).
- * \param[in]     enforcedRotation Enforced rotation module.
- * \param[in]     imdSession       The IMD session
- * \param[in]     pull_work        The pull work structure.
- * \param[in]     step             The current MD step
- * \param[in]     t                The current time
- * \param[in,out] wcycle           Wallcycle accounting struct
- * \param[in,out] forceProviders   Pointer to a list of force providers
- * \param[in]     box              The unit cell
- * \param[in]     x                The coordinates
- * \param[in]     mdatoms          Per atom properties
- * \param[in]     lambda           Array of free-energy lambda values
- * \param[in]     stepWork         Step schedule flags
- * \param[in,out] forceWithVirialMtsLevel0  Force and virial for MTS level0 forces
- * \param[in,out] forceWithVirialMtsLevel1  Force and virial for MTS level1 forces, can be nullptr
- * \param[in,out] enerd            Energy buffer
- * \param[in,out] ed               Essential dynamics pointer
- * \param[in]     didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
- *
- * \todo Remove didNeighborSearch, which is used incorrectly.
- * \todo Convert all other algorithms called here to ForceProviders.
- */
-static void computeSpecialForces(FILE*                          fplog,
-                                 const t_commrec*               cr,
-                                 const t_inputrec*              inputrec,
-                                 gmx::Awh*                      awh,
-                                 gmx_enfrot*                    enforcedRotation,
-                                 gmx::ImdSession*               imdSession,
-                                 pull_t*                        pull_work,
-                                 int64_t                        step,
-                                 double                         t,
-                                 gmx_wallcycle_t                wcycle,
-                                 gmx::ForceProviders*           forceProviders,
-                                 const matrix                   box,
-                                 gmx::ArrayRef<const gmx::RVec> x,
-                                 const t_mdatoms*               mdatoms,
-                                 gmx::ArrayRef<const real>      lambda,
-                                 const StepWorkload&            stepWork,
-                                 gmx::ForceWithVirial*          forceWithVirialMtsLevel0,
-                                 gmx::ForceWithVirial*          forceWithVirialMtsLevel1,
-                                 gmx_enerdata_t*                enerd,
-                                 gmx_edsam*                     ed,
-                                 bool                           didNeighborSearch)
-{
-    /* NOTE: Currently all ForceProviders only provide forces.
-     *       When they also provide energies, remove this conditional.
-     */
-    if (stepWork.computeForces)
-    {
-        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
-        gmx::ForceProviderOutput forceProviderOutput(forceWithVirialMtsLevel0, enerd);
-
-        /* Collect forces from modules */
-        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
-    }
-
-    if (inputrec->bPull && pull_have_potential(*pull_work))
-    {
-        const int mtsLevel = forceGroupMtsLevel(inputrec->mtsLevels, gmx::MtsForceGroups::Pull);
-        if (mtsLevel == 0 || stepWork.computeSlowForces)
-        {
-            auto& forceWithVirial = (mtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
-            pull_potential_wrapper(cr, inputrec, box, x, forceWithVirial, mdatoms, enerd, pull_work,
-                                   lambda.data(), t, wcycle);
-        }
-    }
-    if (awh)
-    {
-        const int mtsLevel = forceGroupMtsLevel(inputrec->mtsLevels, gmx::MtsForceGroups::Pull);
-        if (mtsLevel == 0 || stepWork.computeSlowForces)
-        {
-            const bool needForeignEnergyDifferences = awh->needForeignEnergyDifferences(step);
-            std::vector<double> foreignLambdaDeltaH, foreignLambdaDhDl;
-            if (needForeignEnergyDifferences)
-            {
-                enerd->foreignLambdaTerms.finalizePotentialContributions(enerd->dvdl_lin, lambda,
-                                                                         *inputrec->fepvals);
-                std::tie(foreignLambdaDeltaH, foreignLambdaDhDl) = enerd->foreignLambdaTerms.getTerms(cr);
-            }
-
-            auto& forceWithVirial = (mtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
-            enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
-                    inputrec->pbcType, mdatoms->massT, foreignLambdaDeltaH, foreignLambdaDhDl, box,
-                    forceWithVirial, t, step, wcycle, fplog);
-        }
-    }
-
-    rvec* f = as_rvec_array(forceWithVirialMtsLevel0->force_.data());
-
-    /* Add the forces from enforced rotation potentials (if any) */
-    if (inputrec->bRot)
-    {
-        wallcycle_start(wcycle, ewcROTadd);
-        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
-        wallcycle_stop(wcycle, ewcROTadd);
-    }
-
-    if (ed)
-    {
-        /* Note that since init_edsam() is called after the initialization
-         * of forcerec, edsam doesn't request the noVirSum force buffer.
-         * Thus if no other algorithm (e.g. PME) requires it, the forces
-         * here will contribute to the virial.
-         */
-        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, didNeighborSearch);
-    }
-
-    /* Add forces from interactive molecular dynamics (IMD), if any */
-    if (inputrec->bIMD && stepWork.computeForces)
-    {
-        imdSession->applyForces(f);
-    }
-}
-
-/*! \brief Launch the prepare_step and spread stages of PME GPU.
- *
- * \param[in]  pmedata              The PME structure
- * \param[in]  box                  The box matrix
- * \param[in]  stepWork             Step schedule flags
- * \param[in]  xReadyOnDevice       Event synchronizer indicating that the coordinates are ready in the device memory.
- * \param[in]  lambdaQ              The Coulomb lambda of the current state.
- * \param[in]  wcycle               The wallcycle structure
- */
-static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
-                                      const matrix          box,
-                                      const StepWorkload&   stepWork,
-                                      GpuEventSynchronizer* xReadyOnDevice,
-                                      const real            lambdaQ,
-                                      gmx_wallcycle_t       wcycle)
-{
-    pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
-    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
-}
-
-/*! \brief Launch the FFT and gather stages of PME GPU
- *
- * This function only implements setting the output forces (no accumulation).
- *
- * \param[in]  pmedata        The PME structure
- * \param[in]  lambdaQ        The Coulomb lambda of the current system state.
- * \param[in]  wcycle         The wallcycle structure
- * \param[in]  stepWork       Step schedule flags
- */
-static void launchPmeGpuFftAndGather(gmx_pme_t*               pmedata,
-                                     const real               lambdaQ,
-                                     gmx_wallcycle_t          wcycle,
-                                     const gmx::StepWorkload& stepWork)
-{
-    pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
-    pme_gpu_launch_gather(pmedata, wcycle, lambdaQ);
-}
-
-/*! \brief
- *  Polling wait for either of the PME or nonbonded GPU tasks.
- *
- * Instead of a static order in waiting for GPU tasks, this function
- * polls checking which of the two tasks completes first, and does the
- * associated force buffer reduction overlapped with the other task.
- * By doing that, unlike static scheduling order, it can always overlap
- * one of the reductions, regardless of the GPU task completion order.
- *
- * \param[in]     nbv              Nonbonded verlet structure
- * \param[in,out] pmedata          PME module data
- * \param[in,out] forceOutputsNonbonded  Force outputs for the non-bonded forces and shift forces
- * \param[in,out] forceOutputsPme  Force outputs for the PME forces and virial
- * \param[in,out] enerd            Energy data structure results are reduced into
- * \param[in]     lambdaQ          The Coulomb lambda of the current system state.
- * \param[in]     stepWork         Step schedule flags
- * \param[in]     wcycle           The wallcycle structure
- */
-static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
-                                        gmx_pme_t*          pmedata,
-                                        gmx::ForceOutputs*  forceOutputsNonbonded,
-                                        gmx::ForceOutputs*  forceOutputsPme,
-                                        gmx_enerdata_t*     enerd,
-                                        const real          lambdaQ,
-                                        const StepWorkload& stepWork,
-                                        gmx_wallcycle_t     wcycle)
-{
-    bool isPmeGpuDone = false;
-    bool isNbGpuDone  = false;
-
-    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
-
-    while (!isPmeGpuDone || !isNbGpuDone)
-    {
-        if (!isPmeGpuDone)
-        {
-            GpuTaskCompletion completionType =
-                    (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle,
-                                                   &forceOutputsPme->forceWithVirial(), enerd,
-                                                   lambdaQ, completionType);
-        }
-
-        if (!isNbGpuDone)
-        {
-            auto&             forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
-            GpuTaskCompletion completionType =
-                    (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            isNbGpuDone = Nbnxm::gpu_try_finish_task(
-                    nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
-                    enerd->grpp.ener[egCOULSR].data(), forceBuffersNonbonded.shiftForces(),
-                    completionType, wcycle);
-
-            if (isNbGpuDone)
-            {
-                nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
-            }
-        }
-    }
-}
-
-/*! \brief Set up the different force buffers; also does clearing.
- *
- * \param[in] forceHelperBuffers  Helper force buffers
- * \param[in] force     force array
- * \param[in] stepWork  Step schedule flags
- * \param[out] wcycle   wallcycle recording structure
- *
- * \returns             Cleared force output structure
- */
-static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceHelperBuffers,
-                                      gmx::ArrayRefWithPadding<gmx::RVec> force,
-                                      const StepWorkload&                 stepWork,
-                                      gmx_wallcycle_t                     wcycle)
-{
-    wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);
-
-    /* NOTE: We assume fr->shiftForces is all zeros here */
-    gmx::ForceWithShiftForces forceWithShiftForces(force, stepWork.computeVirial,
-                                                   forceHelperBuffers->shiftForces());
-
-    if (stepWork.computeForces)
-    {
-        /* Clear the short- and long-range forces */
-        clearRVecs(forceWithShiftForces.force(), true);
-
-        /* Clear the shift forces */
-        clearRVecs(forceWithShiftForces.shiftForces(), false);
-    }
-
-    /* If we need to compute the virial, we might need a separate
-     * force buffer for algorithms for which the virial is calculated
-     * directly, such as PME. Otherwise, forceWithVirial uses the
-     * the same force (f in legacy calls) buffer as other algorithms.
-     */
-    const bool useSeparateForceWithVirialBuffer =
-            (stepWork.computeForces
-             && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
-    /* forceWithVirial uses the local atom range only */
-    gmx::ForceWithVirial forceWithVirial(
-            useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
-                                             : force.unpaddedArrayRef(),
-            stepWork.computeVirial);
-
-    if (useSeparateForceWithVirialBuffer)
-    {
-        /* TODO: update comment
-         * We only compute forces on local atoms. Note that vsites can
-         * spread to non-local atoms, but that part of the buffer is
-         * cleared separately in the vsite spreading code.
-         */
-        clearRVecs(forceWithVirial.force_, true);
-    }
-
-    wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER);
-
-    return ForceOutputs(forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(),
-                        forceWithVirial);
-}
-
-
-/*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
- */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
-                                                          const t_forcerec&         fr,
-                                                          const pull_t*             pull_work,
-                                                          const gmx_edsam*          ed,
-                                                          const t_mdatoms&          mdatoms,
-                                                          const SimulationWorkload& simulationWork,
-                                                          const StepWorkload&       stepWork)
-{
-    DomainLifetimeWorkload domainWork;
-    // Note that haveSpecialForces is constant over the whole run
-    domainWork.haveSpecialForces =
-            haveSpecialForces(inputrec, *fr.forceProviders, pull_work, stepWork.computeForces, ed);
-    domainWork.haveCpuListedForceWork = false;
-    domainWork.haveCpuBondedWork      = false;
-    for (const auto& listedForces : fr.listedForces)
-    {
-        if (listedForces.haveCpuListedForces(*fr.fcdata))
-        {
-            domainWork.haveCpuListedForceWork = true;
-        }
-        if (listedForces.haveCpuBondeds())
-        {
-            domainWork.haveCpuBondedWork = true;
-        }
-    }
-    domainWork.haveGpuBondedWork = ((fr.gpuBonded != nullptr) && fr.gpuBonded->haveInteractions());
-    // Note that haveFreeEnergyWork is constant over the whole run
-    domainWork.haveFreeEnergyWork = (fr.efep != efepNO && mdatoms.nPerturbed != 0);
-    // We assume we have local force work if there are CPU
-    // force tasks including PME or nonbondeds.
-    domainWork.haveCpuLocalForceWork =
-            domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork
-            || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
-            || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
-
-    return domainWork;
-}
-
-/*! \brief Set up force flag stuct from the force bitmask.
- *
- * \param[in]      legacyFlags          Force bitmask flags used to construct the new flags
- * \param[in]      mtsLevels            The multiple time-stepping levels, either empty or 2 levels
- * \param[in]      step                 The current MD step
- * \param[in]      simulationWork       Simulation workload description.
- * \param[in]      rankHasPmeDuty       If this rank computes PME.
- *
- * \returns New Stepworkload description.
- */
-static StepWorkload setupStepWorkload(const int                     legacyFlags,
-                                      ArrayRef<const gmx::MtsLevel> mtsLevels,
-                                      const int64_t                 step,
-                                      const SimulationWorkload&     simulationWork,
-                                      const bool                    rankHasPmeDuty)
-{
-    GMX_ASSERT(mtsLevels.empty() || mtsLevels.size() == 2, "Expect 0 or 2 MTS levels");
-    const bool computeSlowForces = (mtsLevels.empty() || step % mtsLevels[1].stepFactor == 0);
-
-    StepWorkload flags;
-    flags.stateChanged        = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
-    flags.haveDynamicBox      = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
-    flags.doNeighborSearch    = ((legacyFlags & GMX_FORCE_NS) != 0);
-    flags.computeSlowForces   = computeSlowForces;
-    flags.computeVirial       = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
-    flags.computeEnergy       = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
-    flags.computeForces       = ((legacyFlags & GMX_FORCE_FORCES) != 0);
-    flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
-    flags.computeNonbondedForces =
-            ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded
-            && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
-    flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
-
-    if (simulationWork.useGpuBufferOps)
-    {
-        GMX_ASSERT(simulationWork.useGpuNonbonded,
-                   "Can only offload buffer ops if nonbonded computation is also offloaded");
-    }
-    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
-    // on virial steps the CPU reduction path is taken
-    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
-    flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
-                                && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
-
-    return flags;
-}
-
-
-/* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
- *
- * TODO: eliminate \p useGpuPmeOnThisRank when this is
- * incorporated in DomainLifetimeWorkload.
- */
-static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
-                                    gmx::GpuBonded*                   gpuBonded,
-                                    gmx_pme_t*                        pmedata,
-                                    gmx_enerdata_t*                   enerd,
-                                    const gmx::MdrunScheduleWorkload& runScheduleWork,
-                                    bool                              useGpuPmeOnThisRank,
-                                    int64_t                           step,
-                                    gmx_wallcycle_t                   wcycle)
-{
-    if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
-    {
-        /* Launch pruning before buffer clearing because the API overhead of the
-         * clear kernel launches can leave the GPU idle while it could be running
-         * the prune kernel.
-         */
-        if (nbv->isDynamicPruningStepGpu(step))
-        {
-            nbv->dispatchPruneKernelGpu(step);
-        }
-
-        /* now clear the GPU outputs while we finish the step on the CPU */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    if (useGpuPmeOnThisRank)
-    {
-        pme_gpu_reinit_computation(pmedata, wcycle);
-    }
-
-    if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
-    {
-        // in principle this should be included in the DD balancing region,
-        // but generally it is infrequent so we'll omit it for the sake of
-        // simpler code
-        gpuBonded->waitAccumulateEnergyTerms(enerd);
-
-        gpuBonded->clearEnergies();
-    }
-}
-
-//! \brief Data structure to hold dipole-related data and staging arrays
-struct DipoleData
-{
-    //! Dipole staging for fast summing over MPI
-    gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
-    //! Dipole staging for states A and B (index 0 and 1 resp.)
-    gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
-};
-
-
-static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
-                                 const t_commrec*              cr,
-                                 const bool                    haveFreeEnergy,
-                                 gmx::ArrayRef<const real>     lambda,
-                                 rvec                          muTotal,
-                                 const DDBalanceRegionHandler& ddBalanceRegionHandler)
-{
-    if (PAR(cr))
-    {
-        gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
-        ddBalanceRegionHandler.reopenRegionCpu();
-    }
-    for (int i = 0; i < 2; i++)
-    {
-        for (int j = 0; j < DIM; j++)
-        {
-            dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
-        }
-    }
-
-    if (!haveFreeEnergy)
-    {
-        copy_rvec(dipoleData->muStateAB[0], muTotal);
-    }
-    else
-    {
-        for (int j = 0; j < DIM; j++)
-        {
-            muTotal[j] = (1.0 - lambda[efptCOUL]) * dipoleData->muStateAB[0][j]
-                         + lambda[efptCOUL] * dipoleData->muStateAB[1][j];
-        }
-    }
-}
-
-/*! \brief Combines MTS level0 and level1 force buffes into a full and MTS-combined force buffer.
- *
- * \param[in]     numAtoms        The number of atoms to combine forces for
- * \param[in,out] forceMtsLevel0  Input: F_level0, output: F_level0 + F_level1
- * \param[in,out] forceMts        Input: F_level1, output: F_level0 + mtsFactor * F_level1
- * \param[in]     mtsFactor       The factor between the level0 and level1 time step
- */
-static void combineMtsForces(const int      numAtoms,
-                             ArrayRef<RVec> forceMtsLevel0,
-                             ArrayRef<RVec> forceMts,
-                             const real     mtsFactor)
-{
-    const int gmx_unused numThreads = gmx_omp_nthreads_get(emntDefault);
-#pragma omp parallel for num_threads(numThreads) schedule(static)
-    for (int i = 0; i < numAtoms; i++)
-    {
-        const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
-        forceMtsLevel0[i] += forceMts[i];
-        forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
-    }
-}
-
-/*! \brief Setup for the local and non-local GPU force reductions:
- * reinitialization plus the registration of forces and dependencies.
- *
- * \param [in] runScheduleWork               Schedule workload flag structure
- * \param [in] cr                            Communication record object
- * \param [in] fr                            Force record object
- */
-static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
-                                    const t_commrec*            cr,
-                                    t_forcerec*                 fr)
-{
-
-    nonbonded_verlet_t*          nbv      = fr->nbv.get();
-    gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
-
-    // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
-    const int atomStart = 0;
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
-            stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::Local), nbv->getGridIndices(),
-            atomStart, accumulate, stateGpu->fReducedOnDevice());
-
-    // register forces and add dependencies
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces());
-
-    if (runScheduleWork->simulationWork.useGpuPme
-        && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
-    {
-        void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                                       : // PME force buffer on same GPU
-                                 fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
-
-        GpuEventSynchronizer* const pmeSynchronizer =
-                (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
-                                               : // PME force buffer on same GPU
-                         fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
-    }
-
-    if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
-        && !runScheduleWork->simulationWork.useGpuHaloExchange)
-    {
-        auto forcesReadyLocality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
-        const bool useGpuForceBufferOps = true;
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                stateGpu->getForcesReadyOnDeviceEvent(forcesReadyLocality, useGpuForceBufferOps));
-    }
-
-    if (runScheduleWork->simulationWork.useGpuHaloExchange)
-    {
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
-    }
-
-    if (havePPDomainDecomposition(cr))
-    {
-        // (re-)initialize non-local GPU force reduction
-        const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
-                                || runScheduleWork->domainWork.haveFreeEnergyWork;
-        const int atomStart = dd_numHomeAtoms(*cr->dd);
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
-                stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::NonLocal),
-                nbv->getGridIndices(), atomStart, accumulate);
-
-        // register forces and add dependencies
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces());
-        if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork)
-        {
-            fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
-                    stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true));
-        }
-    }
-}
-
-
-void do_force(FILE*                               fplog,
-              const t_commrec*                    cr,
-              const gmx_multisim_t*               ms,
-              const t_inputrec*                   inputrec,
-              gmx::Awh*                           awh,
-              gmx_enfrot*                         enforcedRotation,
-              gmx::ImdSession*                    imdSession,
-              pull_t*                             pull_work,
-              int64_t                             step,
-              t_nrnb*                             nrnb,
-              gmx_wallcycle_t                     wcycle,
-              const gmx_localtop_t*               top,
-              const matrix                        box,
-              gmx::ArrayRefWithPadding<gmx::RVec> x,
-              history_t*                          hist,
-              gmx::ForceBuffersView*              forceView,
-              tensor                              vir_force,
-              const t_mdatoms*                    mdatoms,
-              gmx_enerdata_t*                     enerd,
-              gmx::ArrayRef<const real>           lambda,
-              t_forcerec*                         fr,
-              gmx::MdrunScheduleWorkload*         runScheduleWork,
-              gmx::VirtualSitesHandler*           vsite,
-              rvec                                muTotal,
-              double                              t,
-              gmx_edsam*                          ed,
-              int                                 legacyFlags,
-              const DDBalanceRegionHandler&       ddBalanceRegionHandler)
-{
-    auto force = forceView->forceWithPadding();
-    GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
-               "The size of the force buffer should be at least the number of atoms to compute "
-               "forces for");
-
-    nonbonded_verlet_t*  nbv = fr->nbv.get();
-    interaction_const_t* ic  = fr->ic;
-
-    gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
-
-    const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
-
-    runScheduleWork->stepWork    = setupStepWorkload(legacyFlags, inputrec->mtsLevels, step,
-                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
-    const StepWorkload& stepWork = runScheduleWork->stepWork;
-
-    const bool useGpuPmeOnThisRank =
-            simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces;
-
-    /* At a search step we need to start the first balancing region
-     * somewhere early inside the step after communication during domain
-     * decomposition (and not during the previous step as usual).
-     */
-    if (stepWork.doNeighborSearch)
-    {
-        ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
-    }
-
-    clear_mat(vir_force);
-
-    if (fr->pbcType != PbcType::No)
-    {
-        /* Compute shift vectors every step,
-         * because of pressure coupling or box deformation!
-         */
-        if (stepWork.haveDynamicBox && stepWork.stateChanged)
-        {
-            calc_shifts(box, fr->shift_vec);
-        }
-
-        const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
-        const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
-        if (calcCGCM)
-        {
-            put_atoms_in_box_omp(fr->pbcType, box, x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
-                                 gmx_omp_nthreads_get(emntDefault));
-            inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
-        }
-    }
-
-    nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
-
-    const bool pmeSendCoordinatesFromGpu =
-            GMX_MPI && simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
-    const bool reinitGpuPmePpComms =
-            GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
-
-    const auto localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
-                                             ? stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                       AtomLocality::Local, simulationWork, stepWork)
-                                             : nullptr;
-
-    // Copy coordinate from the GPU if update is on the GPU and there
-    // are forces to be computed on the CPU, or for the computation of
-    // virial, or if host-side data will be transferred from this task
-    // to a remote task for halo exchange or PME-PP communication. At
-    // search steps the current coordinates are already on the host,
-    // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
-    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
-    {
-        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        haveCopiedXFromGpu = true;
-    }
-
-    // If coordinates are to be sent to PME task from CPU memory, perform that send here.
-    // Otherwise the send will occur after H2D coordinate transfer.
-    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu && stepWork.computeSlowForces)
-    {
-        /* Send particle coordinates to the pme nodes */
-        if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
-        {
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-
-        gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
-                                 lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
-                                 pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
-    }
-
-    // Coordinates on the device are needed if PME or BufferOps are offloaded.
-    // The local coordinates can be copied right away.
-    // NOTE: Consider moving this copy to right after they are updated and constrained,
-    //       if the later is not offloaded.
-    if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
-    {
-        if (stepWork.doNeighborSearch)
-        {
-            // TODO refactor this to do_md, after partitioning.
-            stateGpu->reinit(mdatoms->homenr,
-                             cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
-            if (useGpuPmeOnThisRank)
-            {
-                // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
-                pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
-            }
-        }
-        // We need to copy coordinates when:
-        // 1. Update is not offloaded
-        // 2. The buffers were reinitialized on search step
-        if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
-        {
-            GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
-            stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        }
-    }
-
-    // If coordinates are to be sent to PME task from GPU memory, perform that send here.
-    // Otherwise the send will occur before the H2D coordinate transfer.
-    if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
-    {
-        /* Send particle coordinates to the pme nodes */
-        gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
-                                 lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
-                                 pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
-    }
-
-    if (useGpuPmeOnThisRank)
-    {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle);
-    }
-
-    const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
-
-    /* do gridding for pair search */
-    if (stepWork.doNeighborSearch)
-    {
-        if (fr->wholeMoleculeTransform && stepWork.stateChanged)
-        {
-            fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
-        }
-
-        // TODO
-        // - vzero is constant, do we need to pass it?
-        // - box_diag should be passed directly to nbnxn_put_on_grid
-        //
-        rvec vzero;
-        clear_rvec(vzero);
-
-        rvec box_diag;
-        box_diag[XX] = box[XX][XX];
-        box_diag[YY] = box[YY][YY];
-        box_diag[ZZ] = box[ZZ][ZZ];
-
-        wallcycle_start(wcycle, ewcNS);
-        if (!DOMAINDECOMP(cr))
-        {
-            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
-            nbnxn_put_on_grid(nbv, box, 0, vzero, box_diag, nullptr, { 0, mdatoms->homenr }, -1,
-                              fr->cginfo, x.unpaddedArrayRef(), 0, nullptr);
-            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
-        }
-        else
-        {
-            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
-            nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef());
-            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
-        }
-
-        nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
-                               gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr), fr->cginfo);
-
-        wallcycle_stop(wcycle, ewcNS);
-
-        /* initialize the GPU nbnxm atom data and bonded data structures */
-        if (simulationWork.useGpuNonbonded)
-        {
-            // Note: cycle counting only nononbondeds, gpuBonded counts internally
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-
-            if (fr->gpuBonded)
-            {
-                /* Now we put all atoms on the grid, we can assign bonded
-                 * interactions to the GPU, where the grid order is
-                 * needed. Also the xq, f and fshift device buffers have
-                 * been reallocated if needed, so the bonded code can
-                 * learn about them. */
-                // TODO the xq, f, and fshift buffers are now shared
-                // resources, so they should be maintained by a
-                // higher-level object than the nb module.
-                fr->gpuBonded->updateInteractionListsAndDeviceBuffers(
-                        nbv->getGridIndices(), top->idef, Nbnxm::gpu_get_xq(nbv->gpu_nbv),
-                        Nbnxm::gpu_get_f(nbv->gpu_nbv), Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
-            }
-        }
-
-        // Need to run after the GPU-offload bonded interaction lists
-        // are set up to be able to determine whether there is bonded work.
-        runScheduleWork->domainWork = setupDomainLifetimeWorkload(
-                *inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
-
-        wallcycle_start_nocount(wcycle, ewcNS);
-        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
-        /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
-        nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
-
-        nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
-
-        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
-        wallcycle_stop(wcycle, ewcNS);
-
-        if (stepWork.useGpuXBufferOps)
-        {
-            nbv->atomdata_init_copy_x_to_nbat_x_gpu();
-        }
-
-        if (simulationWork.useGpuBufferOps)
-        {
-            setupGpuForceReductions(runScheduleWork, cr, fr);
-        }
-    }
-    else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces)
-    {
-        if (stepWork.useGpuXBufferOps)
-        {
-            GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
-            nbv->convertCoordinatesGpu(AtomLocality::Local, false, stateGpu->getCoordinates(),
-                                       localXReadyOnDevice);
-        }
-        else
-        {
-            if (simulationWork.useGpuUpdate)
-            {
-                GMX_ASSERT(stateGpu, "need a valid stateGpu object");
-                GMX_ASSERT(haveCopiedXFromGpu,
-                           "a wait should only be triggered if copy has been scheduled");
-                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-            }
-            nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
-        }
-    }
-
-    if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
-    {
-        ddBalanceRegionHandler.openBeforeForceComputationGpu();
-
-        wallcycle_start(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
-        if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
-        {
-            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
-        }
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-        // with X buffer ops offloaded to the GPU on all but the search steps
-
-        // bonded work not split into separate local and non-local, so with DD
-        // we can only launch the kernel after non-local coordinates have been received.
-        if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
-        {
-            fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
-        }
-
-        /* launch local nonbonded work on GPU */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    if (useGpuPmeOnThisRank)
-    {
-        // In PME GPU and mixed mode we launch FFT / gather after the
-        // X copy/transform to allow overlap as well as after the GPU NB
-        // launch to avoid FFT launch overhead hijacking the CPU and delaying
-        // the nonbonded kernel.
-        launchPmeGpuFftAndGather(fr->pmedata, lambda[efptCOUL], wcycle, stepWork);
-    }
-
-    /* Communicate coordinates and sum dipole if necessary +
-       do non-local pair search */
-    if (havePPDomainDecomposition(cr))
-    {
-        if (stepWork.doNeighborSearch)
-        {
-            // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
-            wallcycle_start_nocount(wcycle, ewcNS);
-            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
-            /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
-            nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
-
-            nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
-            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
-            wallcycle_stop(wcycle, ewcNS);
-            // TODO refactor this GPU halo exchange re-initialisation
-            // to location in do_md where GPU halo exchange is
-            // constructed at partitioning, after above stateGpu
-            // re-initialization has similarly been refactored
-            if (simulationWork.useGpuHaloExchange)
-            {
-                reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
-            }
-        }
-        else
-        {
-            if (stepWork.useGpuXHalo)
-            {
-                // The following must be called after local setCoordinates (which records an event
-                // when the coordinate data has been copied to the device).
-                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
-
-                if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
-                {
-                    // non-local part of coordinate buffer must be copied back to host for CPU work
-                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
-                }
-            }
-            else
-            {
-                if (simulationWork.useGpuUpdate)
-                {
-                    GMX_ASSERT(haveCopiedXFromGpu,
-                               "a wait should only be triggered if copy has been scheduled");
-                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-                }
-                dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
-            }
-
-            if (stepWork.useGpuXBufferOps)
-            {
-                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
-                {
-                    stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
-                }
-                nbv->convertCoordinatesGpu(AtomLocality::NonLocal, false, stateGpu->getCoordinates(),
-                                           stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                   AtomLocality::NonLocal, simulationWork, stepWork));
-            }
-            else
-            {
-                nbv->convertCoordinates(AtomLocality::NonLocal, false, x.unpaddedArrayRef());
-            }
-        }
-
-        if (simulationWork.useGpuNonbonded)
-        {
-
-            if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
-            {
-                wallcycle_start(wcycle, ewcLAUNCH_GPU);
-                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-                Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
-                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-                wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-            }
-
-            if (domainWork.haveGpuBondedWork)
-            {
-                fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
-            }
-
-            /* launch non-local nonbonded tasks on GPU */
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step,
-                         nrnb, wcycle);
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-        }
-    }
-
-    if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
-    {
-        /* launch D2H copy-back F */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
-        if (havePPDomainDecomposition(cr))
-        {
-            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
-        }
-        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
-        if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
-        {
-            fr->gpuBonded->launchEnergyTransfer();
-        }
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
-    if (fr->wholeMoleculeTransform)
-    {
-        xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
-    }
-
-    DipoleData dipoleData;
-
-    if (simulationWork.computeMuTot)
-    {
-        const int start = 0;
-
-        if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
-        {
-            GMX_ASSERT(haveCopiedXFromGpu,
-                       "a wait should only be triggered if copy has been scheduled");
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-
-        /* Calculate total (local) dipole moment in a temporary common array.
-         * This makes it possible to sum them over nodes faster.
-         */
-        gmx::ArrayRef<const gmx::RVec> xRef =
-                (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
-        calc_mu(start, mdatoms->homenr, xRef, mdatoms->chargeA, mdatoms->chargeB,
-                mdatoms->nChargePerturbed, dipoleData.muStaging[0], dipoleData.muStaging[1]);
-
-        reduceAndUpdateMuTot(&dipoleData, cr, (fr->efep != efepNO), lambda, muTotal, ddBalanceRegionHandler);
-    }
-
-    /* Reset energies */
-    reset_enerdata(enerd);
-
-    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
-    {
-        wallcycle_start(wcycle, ewcPPDURINGPME);
-        dd_force_flop_start(cr->dd, nrnb);
-    }
-
-    // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
-    // this wait ensures that the D2H transfer is complete.
-    if ((simulationWork.useGpuUpdate)
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
-    {
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-    }
-
-    if (inputrec->bRot)
-    {
-        wallcycle_start(wcycle, ewcROT);
-        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step,
-                    stepWork.doNeighborSearch);
-        wallcycle_stop(wcycle, ewcROT);
-    }
-
-    /* Start the force cycle counter.
-     * Note that a different counter is used for dynamic load balancing.
-     */
-    wallcycle_start(wcycle, ewcFORCE);
-
-    /* Set up and clear force outputs:
-     * forceOutMtsLevel0:  everything except what is in the other two outputs
-     * forceOutMtsLevel1:  PME-mesh and listed-forces group 1
-     * forceOutNonbonded: non-bonded forces
-     * Without multiple time stepping all point to the same object.
-     * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
-     */
-    ForceOutputs forceOutMtsLevel0 =
-            setupForceOutputs(&fr->forceHelperBuffers[0], force, stepWork, wcycle);
-
-    // Force output for MTS combined forces, only set at level1 MTS steps
-    std::optional<ForceOutputs> forceOutMts =
-            (fr->useMts && stepWork.computeSlowForces)
-                    ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
-                                                      forceView->forceMtsCombinedWithPadding(),
-                                                      stepWork, wcycle))
-                    : std::nullopt;
-
-    ForceOutputs* forceOutMtsLevel1 =
-            fr->useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr) : &forceOutMtsLevel0;
-
-    const bool nonbondedAtMtsLevel1 = runScheduleWork->simulationWork.computeNonbondedAtMtsLevel1;
-
-    ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
-
-    if (inputrec->bPull && pull_have_constraint(*pull_work))
-    {
-        clear_pull_forces(pull_work);
-    }
-
-    /* We calculate the non-bonded forces, when done on the CPU, here.
-     * We do this before calling do_force_lowlevel, because in that
-     * function, the listed forces are calculated before PME, which
-     * does communication.  With this order, non-bonded and listed
-     * force calculation imbalance can be balanced out by the domain
-     * decomposition load balancing.
-     */
-
-    const bool useOrEmulateGpuNb = simulationWork.useGpuNonbonded || fr->nbv->emulateGpu();
-
-    if (!useOrEmulateGpuNb)
-    {
-        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
-    }
-
-    if (fr->efep != efepNO && stepWork.computeNonbondedForces)
-    {
-        /* Calculate the local and non-local free energy interactions here.
-         * Happens here on the CPU both with and without GPU.
-         */
-        nbv->dispatchFreeEnergyKernel(InteractionLocality::Local, fr,
-                                      as_rvec_array(x.unpaddedArrayRef().data()),
-                                      &forceOutNonbonded->forceWithShiftForces(), *mdatoms,
-                                      inputrec->fepvals, lambda, enerd, stepWork, nrnb);
-
-        if (havePPDomainDecomposition(cr))
-        {
-            nbv->dispatchFreeEnergyKernel(InteractionLocality::NonLocal, fr,
-                                          as_rvec_array(x.unpaddedArrayRef().data()),
-                                          &forceOutNonbonded->forceWithShiftForces(), *mdatoms,
-                                          inputrec->fepvals, lambda, enerd, stepWork, nrnb);
-        }
-    }
-
-    if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
-    {
-        if (havePPDomainDecomposition(cr))
-        {
-            do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step,
-                         nrnb, wcycle);
-        }
-
-        if (stepWork.computeForces)
-        {
-            /* Add all the non-bonded force to the normal force array.
-             * This can be split into a local and a non-local part when overlapping
-             * communication with calculation with domain decomposition.
-             */
-            wallcycle_stop(wcycle, ewcFORCE);
-            nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
-                                          forceOutNonbonded->forceWithShiftForces().force());
-            wallcycle_start_nocount(wcycle, ewcFORCE);
-        }
-
-        /* If there are multiple fshift output buffers we need to reduce them */
-        if (stepWork.computeVirial)
-        {
-            /* This is not in a subcounter because it takes a
-               negligible and constant-sized amount of time */
-            nbnxn_atomdata_add_nbat_fshift_to_fshift(
-                    *nbv->nbat, forceOutNonbonded->forceWithShiftForces().shiftForces());
-        }
-    }
-
-    // TODO Force flags should include haveFreeEnergyWork for this domain
-    if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
-    {
-        wallcycle_stop(wcycle, ewcFORCE);
-        /* Wait for non-local coordinate data to be copied from device */
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
-        wallcycle_start_nocount(wcycle, ewcFORCE);
-    }
-
-    // Compute wall interactions, when present.
-    // Note: should be moved to special forces.
-    if (inputrec->nwall && stepWork.computeNonbondedForces)
-    {
-        /* foreign lambda component for walls */
-        real dvdl_walls = do_walls(*inputrec, *fr, box, *mdatoms, x.unpaddedConstArrayRef(),
-                                   &forceOutMtsLevel0.forceWithVirial(), lambda[efptVDW],
-                                   enerd->grpp.ener[egLJSR].data(), nrnb);
-        enerd->dvdl_lin[efptVDW] += dvdl_walls;
-    }
-
-    if (stepWork.computeListedForces)
-    {
-        /* Check whether we need to take into account PBC in listed interactions */
-        bool needMolPbc = false;
-        for (const auto& listedForces : fr->listedForces)
-        {
-            if (listedForces.haveCpuListedForces(*fr->fcdata))
-            {
-                needMolPbc = fr->bMolPBC;
-            }
-        }
-
-        t_pbc pbc;
-
-        if (needMolPbc)
-        {
-            /* Since all atoms are in the rectangular or triclinic unit-cell,
-             * only single box vector shifts (2 in x) are required.
-             */
-            set_pbc_dd(&pbc, fr->pbcType, DOMAINDECOMP(cr) ? cr->dd->numCells : nullptr, TRUE, box);
-        }
-
-        for (int mtsIndex = 0; mtsIndex < (fr->useMts && stepWork.computeSlowForces ? 2 : 1); mtsIndex++)
-        {
-            ListedForces& listedForces = fr->listedForces[mtsIndex];
-            ForceOutputs& forceOut     = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
-            listedForces.calculate(
-                    wcycle, box, inputrec->fepvals, cr, ms, x, xWholeMolecules, fr->fcdata.get(),
-                    hist, &forceOut, fr, &pbc, enerd, nrnb, lambda.data(), mdatoms,
-                    DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr, stepWork);
-        }
-    }
-
-    if (stepWork.computeSlowForces)
-    {
-        calculateLongRangeNonbondeds(fr, inputrec, cr, nrnb, wcycle, mdatoms,
-                                     x.unpaddedConstArrayRef(), &forceOutMtsLevel1->forceWithVirial(),
-                                     enerd, box, lambda.data(), as_rvec_array(dipoleData.muStateAB),
-                                     stepWork, ddBalanceRegionHandler);
-    }
-
-    /* PLUMED */
-    if(plumedswitch){
-      int plumedNeedsEnergy;
-      plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
-      if(!plumedNeedsEnergy) plumed_cmd(plumedmain,"performCalc",nullptr);
-    }
-    /* END PLUMED */ 
-
-    wallcycle_stop(wcycle, ewcFORCE);
-
-    // VdW dispersion correction, only computed on master rank to avoid double counting
-    if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
-    {
-        // Calculate long range corrections to pressure and energy
-        const DispersionCorrection::Correction correction =
-                fr->dispersionCorrection->calculate(box, lambda[efptVDW]);
-
-        if (stepWork.computeEnergy)
-        {
-            enerd->term[F_DISPCORR] = correction.energy;
-            enerd->term[F_DVDL_VDW] += correction.dvdl;
-            enerd->dvdl_lin[efptVDW] += correction.dvdl;
-        }
-        if (stepWork.computeVirial)
-        {
-            correction.correctVirial(vir_force);
-            enerd->term[F_PDISPCORR] = correction.pressure;
-        }
-    }
-
-    const bool needToReceivePmeResultsFromSeparateRank =
-            (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces);
-
-    /* When running free energy perturbations steered by AWH and doing PME calculations on the
-     * GPU we must wait for the PME calculation (dhdl) results to finish before sampling the
-     * FEP dimension with AWH. */
-    const bool needEarlyPmeResults = (awh != nullptr && awh->hasFepLambdaDimension()
-                                      && pme_run_mode(fr->pmedata) != PmeRunMode::None
-                                      && stepWork.computeEnergy && stepWork.computeSlowForces);
-    if (needEarlyPmeResults)
-    {
-        if (useGpuPmeOnThisRank)
-        {
-            pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle,
-                                    &forceOutMtsLevel1->forceWithVirial(), enerd, lambda[efptCOUL]);
-        }
-        else if (needToReceivePmeResultsFromSeparateRank)
-        {
-            /* In case of node-splitting, the PP nodes receive the long-range
-             * forces, virial and energy from the PME nodes here.
-             */
-            pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
-                                   simulationWork.useGpuPmePpCommunication,
-                                   stepWork.useGpuPmeFReduction, wcycle);
-        }
-    }
-
-    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation, imdSession, pull_work, step, t,
-                         wcycle, fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
-                         stepWork, &forceOutMtsLevel0.forceWithVirial(),
-                         forceOutMtsLevel1 ? &forceOutMtsLevel1->forceWithVirial() : nullptr, enerd,
-                         ed, stepWork.doNeighborSearch);
-
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
-               "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
-               "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
-    // Will store the amount of cycles spent waiting for the GPU that
-    // will be later used in the DLB accounting.
-    float cycles_wait_gpu = 0;
-    if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
-    {
-        auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
-
-        /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
-        {
-            if (simulationWork.useGpuNonbonded)
-            {
-                cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
-                        nbv->gpu_nbv, stepWork, AtomLocality::NonLocal, enerd->grpp.ener[egLJSR].data(),
-                        enerd->grpp.ener[egCOULSR].data(), forceWithShiftForces.shiftForces(), wcycle);
-            }
-            else
-            {
-                wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes,
-                             step, nrnb, wcycle);
-                wallcycle_stop(wcycle, ewcFORCE);
-            }
-
-            if (stepWork.useGpuFBufferOps)
-            {
-                // TODO: move this into DomainLifetimeWorkload, including the second part of the
-                // condition The bonded and free energy CPU tasks can have non-local force
-                // contributions which are a dependency for the GPU force reduction.
-                bool haveNonLocalForceContribInCpuBuffer =
-                        domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
-
-                if (haveNonLocalForceContribInCpuBuffer)
-                {
-                    stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
-                                              AtomLocality::NonLocal);
-                }
-
-
-                fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
-
-                if (!stepWork.useGpuFHalo)
-                {
-                    // copy from GPU input for dd_move_f()
-                    stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
-                                                AtomLocality::NonLocal);
-                }
-            }
-            else
-            {
-                nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
-            }
-
-            if (fr->nbv->emulateGpu() && stepWork.computeVirial)
-            {
-                nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, forceWithShiftForces.shiftForces());
-            }
-        }
-    }
-
-    /* Combining the forces for multiple time stepping before the halo exchange, when possible,
-     * avoids an extra halo exchange (when DD is used) and post-processing step.
-     */
-    const bool combineMtsForcesBeforeHaloExchange =
-            (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
-             && (legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0
-             && !(stepWork.computeVirial || simulationWork.useGpuNonbonded || useGpuPmeOnThisRank));
-    if (combineMtsForcesBeforeHaloExchange)
-    {
-        const int numAtoms = havePPDomainDecomposition(cr) ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr;
-        combineMtsForces(numAtoms, force.unpaddedArrayRef(), forceView->forceMtsCombined(),
-                         inputrec->mtsLevels[1].stepFactor);
-    }
-
-    if (havePPDomainDecomposition(cr))
-    {
-        /* We are done with the CPU compute.
-         * We will now communicate the non-local forces.
-         * If we use a GPU this will overlap with GPU work, so in that case
-         * we do not close the DD force balancing region here.
-         */
-        ddBalanceRegionHandler.closeAfterForceComputationCpu();
-
-        if (stepWork.computeForces)
-        {
-
-            if (stepWork.useGpuFHalo)
-            {
-                if (domainWork.haveCpuLocalForceWork)
-                {
-                    stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
-                                              AtomLocality::Local);
-                }
-                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
-            }
-            else
-            {
-                if (stepWork.useGpuFBufferOps)
-                {
-                    stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
-                }
-
-                // Without MTS or with MTS at slow steps with uncombined forces we need to
-                // communicate the fast forces
-                if (!fr->useMts || !combineMtsForcesBeforeHaloExchange)
-                {
-                    dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
-                }
-                // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
-                if (fr->useMts && stepWork.computeSlowForces)
-                {
-                    dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
-                }
-            }
-        }
-    }
-
-    // With both nonbonded and PME offloaded a GPU on the same rank, we use
-    // an alternating wait/reduction scheme.
-    // When running free energy perturbations steered by AWH and calculating PME on GPU,
-    // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
-    bool alternateGpuWait =
-            (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
-             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps && !needEarlyPmeResults);
-    if (alternateGpuWait)
-    {
-        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, forceOutNonbonded,
-                                    forceOutMtsLevel1, enerd, lambda[efptCOUL], stepWork, wcycle);
-    }
-
-    if (!alternateGpuWait && useGpuPmeOnThisRank && !needEarlyPmeResults)
-    {
-        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle,
-                                &forceOutMtsLevel1->forceWithVirial(), enerd, lambda[efptCOUL]);
-    }
-
-    /* Wait for local GPU NB outputs on the non-alternating wait path */
-    if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
-    {
-        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
-         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
-         * but even with a step of 0.1 ms the difference is less than 1%
-         * of the step time.
-         */
-        const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
-        const float waitCycles               = Nbnxm::gpu_wait_finish_task(
-                nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
-                enerd->grpp.ener[egCOULSR].data(),
-                forceOutNonbonded->forceWithShiftForces().shiftForces(), wcycle);
-
-        if (ddBalanceRegionHandler.useBalancingRegion())
-        {
-            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
-            if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
-            {
-                /* We measured few cycles, it could be that the kernel
-                 * and transfer finished earlier and there was no actual
-                 * wait time, only API call overhead.
-                 * Then the actual time could be anywhere between 0 and
-                 * cycles_wait_est. We will use half of cycles_wait_est.
-                 */
-                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
-            }
-            ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
-        }
-    }
-
-    if (fr->nbv->emulateGpu())
-    {
-        // NOTE: emulation kernel is not included in the balancing region,
-        // but emulation mode does not target performance anyway
-        wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local,
-                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes, step, nrnb, wcycle);
-        wallcycle_stop(wcycle, ewcFORCE);
-    }
-
-    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
-    // TODO refactor this and unify with below default-path call to the same function
-    // When running free energy perturbations steered by AWH and calculating PME on GPU,
-    // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
-    if (needToReceivePmeResultsFromSeparateRank && simulationWork.useGpuPmePpCommunication && !needEarlyPmeResults)
-    {
-        /* In case of node-splitting, the PP nodes receive the long-range
-         * forces, virial and energy from the PME nodes here.
-         */
-        pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePpCommunication,
-                               stepWork.useGpuPmeFReduction, wcycle);
-    }
-
-
-    /* Do the nonbonded GPU (or emulation) force buffer reduction
-     * on the non-alternating path. */
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
-               "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    if (useOrEmulateGpuNb && !alternateGpuWait)
-    {
-        if (stepWork.useGpuFBufferOps)
-        {
-            ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
-
-            // Flag to specify whether the CPU force buffer has contributions to
-            // local atoms. This depends on whether there are CPU-based force tasks
-            // or when DD is active the halo exchange has resulted in contributions
-            // from the non-local part.
-            const bool haveLocalForceContribInCpuBuffer =
-                    (domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr));
-
-            // TODO: move these steps as early as possible:
-            // - CPU f H2D should be as soon as all CPU-side forces are done
-            // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
-            //   before the next CPU task that consumes the forces: vsite spread or update)
-            // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
-            //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
-            //   These should be unified.
-            if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
-            {
-                // Note: AtomLocality::All is used for the non-DD case because, as in this
-                // case copyForcesToGpu() uses a separate stream, it allows overlap of
-                // CPU force H2D with GPU force tasks on all streams including those in the
-                // local stream which would otherwise be implicit dependencies for the
-                // transfer and would not overlap.
-                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
-
-                stateGpu->copyForcesToGpu(forceWithShift, locality);
-            }
-
-            if (stepWork.computeNonbondedForces)
-            {
-                fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
-            }
-
-            // Copy forces to host if they are needed for update or if virtual sites are enabled.
-            // If there are vsites, we need to copy forces every step to spread vsite forces on host.
-            // TODO: When the output flags will be included in step workload, this copy can be combined with the
-            //       copy call done in sim_utils(...) for the output.
-            // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
-            //       they should not be copied in do_md(...) for the output.
-            if (!simulationWork.useGpuUpdate
-                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
-            {
-                stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
-                stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
-            }
-        }
-        else if (stepWork.computeNonbondedForces)
-        {
-            ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
-            nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
-        }
-    }
-
-    launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
-                            useGpuPmeOnThisRank, step, wcycle);
-
-    if (DOMAINDECOMP(cr))
-    {
-        dd_force_flop_stop(cr->dd, nrnb);
-    }
-
-    const bool haveCombinedMtsForces = (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
-                                        && combineMtsForcesBeforeHaloExchange);
-    if (stepWork.computeForces)
-    {
-        postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0,
-                                        vir_force, *mdatoms, *fr, vsite, stepWork);
-
-        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
-        {
-            postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1,
-                                            vir_force, *mdatoms, *fr, vsite, stepWork);
-        }
-    }
-
-    // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
-    // When running free energy perturbations steered by AWH and calculating PME on GPU,
-    // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
-    if (needToReceivePmeResultsFromSeparateRank && !simulationWork.useGpuPmePpCommunication
-        && !needEarlyPmeResults)
-    {
-        /* In case of node-splitting, the PP nodes receive the long-range
-         * forces, virial and energy from the PME nodes here.
-         */
-        pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePpCommunication, false, wcycle);
-    }
-
-    if (stepWork.computeForces)
-    {
-        /* If we don't use MTS or if we already combined the MTS forces before, we only
-         * need to post-process one ForceOutputs object here, called forceOutCombined,
-         * otherwise we have to post-process two outputs and then combine them.
-         */
-        ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
-        postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined,
-                          vir_force, mdatoms, fr, vsite, stepWork);
-
-        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
-        {
-            postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1,
-                              vir_force, mdatoms, fr, vsite, stepWork);
-
-            combineMtsForces(mdatoms->homenr, force.unpaddedArrayRef(),
-                             forceView->forceMtsCombined(), inputrec->mtsLevels[1].stepFactor);
-        }
-    }
-
-    if (stepWork.computeEnergy)
-    {
-        /* Compute the final potential energy terms */
-        accumulatePotentialEnergies(enerd, lambda, inputrec->fepvals);
-
-        if (!EI_TPI(inputrec->eI))
-        {
-            checkPotentialEnergyValidity(step, *enerd, *inputrec);
-        }
-    }
-
-    /* In case we don't have constraints and are using GPUs, the next balancing
-     * region starts here.
-     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
-     * virial calculation and COM pulling, is not thus not included in
-     * the balance timing, which is ok as most tasks do communication.
-     */
-    ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/sim_util.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdlib/sim_util.cpp.preplumed
deleted file mode 100644
index 2571b0d216..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdlib/sim_util.cpp.preplumed
+++ /dev/null
@@ -1,2164 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include "config.h"
-
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-
-#include <array>
-#include <optional>
-
-#include "gromacs/applied_forces/awh/awh.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/gpuhaloexchange.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/ewald/pme.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
-#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
-#include "gromacs/gmxlib/nonbonded/nonbonded.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/listed_forces/orires.h"
-#include "gromacs/math/arrayrefwithpadding.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/math/vecdump.h"
-#include "gromacs/mdlib/calcmu.h"
-#include "gromacs/mdlib/calcvir.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/dispersioncorrection.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdlib/wall.h"
-#include "gromacs/mdlib/wholemoleculetransform.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forceoutput.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/iforceprovider.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/multipletimestepping.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/nbnxm/nbnxm_gpu.h"
-#include "gromacs/pbcutil/ishift.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/timing/cyclecounter.h"
-#include "gromacs/timing/gpu_timing.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/wallcyclereporting.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/exceptions.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/fixedcapacityvector.h"
-#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/smalloc.h"
-#include "gromacs/utility/strconvert.h"
-#include "gromacs/utility/sysinfo.h"
-
-#include "gpuforcereduction.h"
-
-using gmx::ArrayRef;
-using gmx::AtomLocality;
-using gmx::DomainLifetimeWorkload;
-using gmx::ForceOutputs;
-using gmx::ForceWithShiftForces;
-using gmx::InteractionLocality;
-using gmx::RVec;
-using gmx::SimulationWorkload;
-using gmx::StepWorkload;
-
-// TODO: this environment variable allows us to verify before release
-// that on less common architectures the total cost of polling is not larger than
-// a blocking wait (so polling does not introduce overhead when the static
-// PME-first ordering would suffice).
-static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
-
-static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
-{
-    GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
-    const int end = forceToAdd.size();
-
-    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
-#pragma omp parallel for num_threads(nt) schedule(static)
-    for (int i = 0; i < end; i++)
-    {
-        rvec_inc(f[i], forceToAdd[i]);
-    }
-}
-
-static void calc_virial(int                              start,
-                        int                              homenr,
-                        const rvec                       x[],
-                        const gmx::ForceWithShiftForces& forceWithShiftForces,
-                        tensor                           vir_part,
-                        const matrix                     box,
-                        t_nrnb*                          nrnb,
-                        const t_forcerec*                fr,
-                        PbcType                          pbcType)
-{
-    /* The short-range virial from surrounding boxes */
-    const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
-    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, pbcType == PbcType::Screw, box);
-    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
-
-    /* Calculate partial virial, for local atoms only, based on short range.
-     * Total virial is computed in global_stat, called from do_md
-     */
-    const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
-    f_calc_vir(start, start + homenr, x, f, vir_part, box);
-    inc_nrnb(nrnb, eNR_VIRIAL, homenr);
-
-    if (debug)
-    {
-        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
-    }
-}
-
-static void pull_potential_wrapper(const t_commrec*               cr,
-                                   const t_inputrec*              ir,
-                                   const matrix                   box,
-                                   gmx::ArrayRef<const gmx::RVec> x,
-                                   gmx::ForceWithVirial*          force,
-                                   const t_mdatoms*               mdatoms,
-                                   gmx_enerdata_t*                enerd,
-                                   pull_t*                        pull_work,
-                                   const real*                    lambda,
-                                   double                         t,
-                                   gmx_wallcycle_t                wcycle)
-{
-    t_pbc pbc;
-    real  dvdl;
-
-    /* Calculate the center of mass forces, this requires communication,
-     * which is why pull_potential is called close to other communication.
-     */
-    wallcycle_start(wcycle, ewcPULLPOT);
-    set_pbc(&pbc, ir->pbcType, box);
-    dvdl = 0;
-    enerd->term[F_COM_PULL] +=
-            pull_potential(pull_work, mdatoms->massT, &pbc, cr, t, lambda[efptRESTRAINT],
-                           as_rvec_array(x.data()), force, &dvdl);
-    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
-    wallcycle_stop(wcycle, ewcPULLPOT);
-}
-
-static void pme_receive_force_ener(t_forcerec*           fr,
-                                   const t_commrec*      cr,
-                                   gmx::ForceWithVirial* forceWithVirial,
-                                   gmx_enerdata_t*       enerd,
-                                   bool                  useGpuPmePpComms,
-                                   bool                  receivePmeForceToGpu,
-                                   gmx_wallcycle_t       wcycle)
-{
-    real  e_q, e_lj, dvdl_q, dvdl_lj;
-    float cycles_ppdpme, cycles_seppme;
-
-    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
-    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
-
-    /* In case of node-splitting, the PP nodes receive the long-range
-     * forces, virial and energy from the PME nodes here.
-     */
-    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
-    dvdl_q  = 0;
-    dvdl_lj = 0;
-    gmx_pme_receive_f(fr->pmePpCommGpu.get(), cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
-                      useGpuPmePpComms, receivePmeForceToGpu, &cycles_seppme);
-    enerd->term[F_COUL_RECIP] += e_q;
-    enerd->term[F_LJ_RECIP] += e_lj;
-    enerd->dvdl_lin[efptCOUL] += dvdl_q;
-    enerd->dvdl_lin[efptVDW] += dvdl_lj;
-
-    if (wcycle)
-    {
-        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
-    }
-    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
-}
-
-static void print_large_forces(FILE*                fp,
-                               const t_mdatoms*     md,
-                               const t_commrec*     cr,
-                               int64_t              step,
-                               real                 forceTolerance,
-                               ArrayRef<const RVec> x,
-                               ArrayRef<const RVec> f)
-{
-    real       force2Tolerance = gmx::square(forceTolerance);
-    gmx::index numNonFinite    = 0;
-    for (int i = 0; i < md->homenr; i++)
-    {
-        real force2    = norm2(f[i]);
-        bool nonFinite = !std::isfinite(force2);
-        if (force2 >= force2Tolerance || nonFinite)
-        {
-            fprintf(fp, "step %" PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n", step,
-                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
-        }
-        if (nonFinite)
-        {
-            numNonFinite++;
-        }
-    }
-    if (numNonFinite > 0)
-    {
-        /* Note that with MPI this fatal call on one rank might interrupt
-         * the printing on other ranks. But we can only avoid that with
-         * an expensive MPI barrier that we would need at each step.
-         */
-        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
-    }
-}
-
-//! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
-static void postProcessForceWithShiftForces(t_nrnb*                   nrnb,
-                                            gmx_wallcycle_t           wcycle,
-                                            const matrix              box,
-                                            ArrayRef<const RVec>      x,
-                                            ForceOutputs*             forceOutputs,
-                                            tensor                    vir_force,
-                                            const t_mdatoms&          mdatoms,
-                                            const t_forcerec&         fr,
-                                            gmx::VirtualSitesHandler* vsite,
-                                            const StepWorkload&       stepWork)
-{
-    ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
-
-    /* If we have NoVirSum forces, but we do not calculate the virial,
-     * we later sum the forceWithShiftForces buffer together with
-     * the noVirSum buffer and spread the combined vsite forces at once.
-     */
-    if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
-    {
-        using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;
-
-        auto                 f      = forceWithShiftForces.force();
-        auto                 fshift = forceWithShiftForces.shiftForces();
-        const VirialHandling virialHandling =
-                (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
-        vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
-        forceWithShiftForces.haveSpreadVsiteForces() = true;
-    }
-
-    if (stepWork.computeVirial)
-    {
-        /* Calculation of the virial must be done after vsites! */
-        calc_virial(0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force,
-                    box, nrnb, &fr, fr.pbcType);
-    }
-}
-
-//! Spread, compute virial for and sum forces, when necessary
-static void postProcessForces(const t_commrec*          cr,
-                              int64_t                   step,
-                              t_nrnb*                   nrnb,
-                              gmx_wallcycle_t           wcycle,
-                              const matrix              box,
-                              ArrayRef<const RVec>      x,
-                              ForceOutputs*             forceOutputs,
-                              tensor                    vir_force,
-                              const t_mdatoms*          mdatoms,
-                              const t_forcerec*         fr,
-                              gmx::VirtualSitesHandler* vsite,
-                              const StepWorkload&       stepWork)
-{
-    // Extract the final output force buffer, which is also the buffer for forces with shift forces
-    ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();
-
-    if (forceOutputs->haveForceWithVirial())
-    {
-        auto& forceWithVirial = forceOutputs->forceWithVirial();
-
-        if (vsite)
-        {
-            /* Spread the mesh force on virtual sites to the other particles...
-             * This is parallellized. MPI communication is performed
-             * if the constructing atoms aren't local.
-             */
-            GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
-                       "We need separate force buffers for shift and virial forces when "
-                       "computing the virial");
-            GMX_ASSERT(!stepWork.computeVirial
-                               || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
-                       "We should spread the force with shift forces separately when computing "
-                       "the virial");
-            const gmx::VirtualSitesHandler::VirialHandling virialHandling =
-                    (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
-                                            : gmx::VirtualSitesHandler::VirialHandling::None);
-            matrix virial = { { 0 } };
-            vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
-            forceWithVirial.addVirialContribution(virial);
-        }
-
-        if (stepWork.computeVirial)
-        {
-            /* Now add the forces, this is local */
-            sum_forces(f, forceWithVirial.force_);
-
-            /* Add the direct virial contributions */
-            GMX_ASSERT(
-                    forceWithVirial.computeVirial_,
-                    "forceWithVirial should request virial computation when we request the virial");
-            m_add(vir_force, forceWithVirial.getVirial(), vir_force);
-
-            if (debug)
-            {
-                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
-            }
-        }
-    }
-    else
-    {
-        GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
-                   "We should have spread the vsite forces (earlier)");
-    }
-
-    if (fr->print_force >= 0)
-    {
-        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
-    }
-}
-
-static void do_nb_verlet(t_forcerec*                fr,
-                         const interaction_const_t* ic,
-                         gmx_enerdata_t*            enerd,
-                         const StepWorkload&        stepWork,
-                         const InteractionLocality  ilocality,
-                         const int                  clearF,
-                         const int64_t              step,
-                         t_nrnb*                    nrnb,
-                         gmx_wallcycle_t            wcycle)
-{
-    if (!stepWork.computeNonbondedForces)
-    {
-        /* skip non-bonded calculation */
-        return;
-    }
-
-    nonbonded_verlet_t* nbv = fr->nbv.get();
-
-    /* GPU kernel launch overhead is already timed separately */
-    if (!nbv->useGpu())
-    {
-        /* When dynamic pair-list  pruning is requested, we need to prune
-         * at nstlistPrune steps.
-         */
-        if (nbv->isDynamicPruningStepCpu(step))
-        {
-            /* Prune the pair-list beyond fr->ic->rlistPrune using
-             * the current coordinates of the atoms.
-             */
-            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
-            nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
-            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
-        }
-    }
-
-    nbv->dispatchNonbondedKernel(ilocality, *ic, stepWork, clearF, *fr, enerd, nrnb);
-}
-
-static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
-{
-    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, v.ssize());
-
-    /* Note that we would like to avoid this conditional by putting it
-     * into the omp pragma instead, but then we still take the full
-     * omp parallel for overhead (at least with gcc5).
-     */
-    if (!useOpenmpThreading || nth == 1)
-    {
-        for (RVec& elem : v)
-        {
-            clear_rvec(elem);
-        }
-    }
-    else
-    {
-#pragma omp parallel for num_threads(nth) schedule(static)
-        for (gmx::index i = 0; i < v.ssize(); i++)
-        {
-            clear_rvec(v[i]);
-        }
-    }
-}
-
-/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
- *
- * \param groupOptions  Group options, containing T-coupling options
- */
-static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
-{
-    real nrdfCoupled   = 0;
-    real nrdfUncoupled = 0;
-    real kineticEnergy = 0;
-    for (int g = 0; g < groupOptions.ngtc; g++)
-    {
-        if (groupOptions.tau_t[g] >= 0)
-        {
-            nrdfCoupled += groupOptions.nrdf[g];
-            kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * BOLTZ;
-        }
-        else
-        {
-            nrdfUncoupled += groupOptions.nrdf[g];
-        }
-    }
-
-    /* This conditional with > also catches nrdf=0 */
-    if (nrdfCoupled > nrdfUncoupled)
-    {
-        return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-/*! \brief This routine checks that the potential energy is finite.
- *
- * Always checks that the potential energy is finite. If step equals
- * inputrec.init_step also checks that the magnitude of the potential energy
- * is reasonable. Terminates with a fatal error when a check fails.
- * Note that passing this check does not guarantee finite forces,
- * since those use slightly different arithmetics. But in most cases
- * there is just a narrow coordinate range where forces are not finite
- * and energies are finite.
- *
- * \param[in] step      The step number, used for checking and printing
- * \param[in] enerd     The energy data; the non-bonded group energies need to be added to
- * enerd.term[F_EPOT] before calling this routine \param[in] inputrec  The input record
- */
-static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
-{
-    /* Threshold valid for comparing absolute potential energy against
-     * the kinetic energy. Normally one should not consider absolute
-     * potential energy values, but with a factor of one million
-     * we should never get false positives.
-     */
-    constexpr real c_thresholdFactor = 1e6;
-
-    bool energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
-    real averageKineticEnergy = 0;
-    /* We only check for large potential energy at the initial step,
-     * because that is by far the most likely step for this too occur
-     * and because computing the average kinetic energy is not free.
-     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
-     * before they become NaN.
-     */
-    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
-    {
-        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
-    }
-
-    if (energyIsNotFinite
-        || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
-    {
-        gmx_fatal(
-                FARGS,
-                "Step %" PRId64
-                ": The total potential energy is %g, which is %s. The LJ and electrostatic "
-                "contributions to the energy are %g and %g, respectively. A %s potential energy "
-                "can be caused by overlapping interactions in bonded interactions or very large%s "
-                "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
-                "configuration, incorrect interactions or parameters in the topology.",
-                step, enerd.term[F_EPOT], energyIsNotFinite ? "not finite" : "extremely high",
-                enerd.term[F_LJ], enerd.term[F_COUL_SR],
-                energyIsNotFinite ? "non-finite" : "very high", energyIsNotFinite ? " or Nan" : "");
-    }
-}
-
-/*! \brief Return true if there are special forces computed this step.
- *
- * The conditionals exactly correspond to those in computeSpecialForces().
- */
-static bool haveSpecialForces(const t_inputrec&          inputrec,
-                              const gmx::ForceProviders& forceProviders,
-                              const pull_t*              pull_work,
-                              const bool                 computeForces,
-                              const gmx_edsam*           ed)
-{
-
-    return ((computeForces && forceProviders.hasForceProvider()) || // forceProviders
-            (inputrec.bPull && pull_have_potential(*pull_work)) ||  // pull
-            inputrec.bRot ||                                        // enforced rotation
-            (ed != nullptr) ||                                      // flooding
-            (inputrec.bIMD && computeForces));                      // IMD
-}
-
-/*! \brief Compute forces and/or energies for special algorithms
- *
- * The intention is to collect all calls to algorithms that compute
- * forces on local atoms only and that do not contribute to the local
- * virial sum (but add their virial contribution separately).
- * Eventually these should likely all become ForceProviders.
- * Within this function the intention is to have algorithms that do
- * global communication at the end, so global barriers within the MD loop
- * are as close together as possible.
- *
- * \param[in]     fplog            The log file
- * \param[in]     cr               The communication record
- * \param[in]     inputrec         The input record
- * \param[in]     awh              The Awh module (nullptr if none in use).
- * \param[in]     enforcedRotation Enforced rotation module.
- * \param[in]     imdSession       The IMD session
- * \param[in]     pull_work        The pull work structure.
- * \param[in]     step             The current MD step
- * \param[in]     t                The current time
- * \param[in,out] wcycle           Wallcycle accounting struct
- * \param[in,out] forceProviders   Pointer to a list of force providers
- * \param[in]     box              The unit cell
- * \param[in]     x                The coordinates
- * \param[in]     mdatoms          Per atom properties
- * \param[in]     lambda           Array of free-energy lambda values
- * \param[in]     stepWork         Step schedule flags
- * \param[in,out] forceWithVirialMtsLevel0  Force and virial for MTS level0 forces
- * \param[in,out] forceWithVirialMtsLevel1  Force and virial for MTS level1 forces, can be nullptr
- * \param[in,out] enerd            Energy buffer
- * \param[in,out] ed               Essential dynamics pointer
- * \param[in]     didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
- *
- * \todo Remove didNeighborSearch, which is used incorrectly.
- * \todo Convert all other algorithms called here to ForceProviders.
- */
-static void computeSpecialForces(FILE*                          fplog,
-                                 const t_commrec*               cr,
-                                 const t_inputrec*              inputrec,
-                                 gmx::Awh*                      awh,
-                                 gmx_enfrot*                    enforcedRotation,
-                                 gmx::ImdSession*               imdSession,
-                                 pull_t*                        pull_work,
-                                 int64_t                        step,
-                                 double                         t,
-                                 gmx_wallcycle_t                wcycle,
-                                 gmx::ForceProviders*           forceProviders,
-                                 const matrix                   box,
-                                 gmx::ArrayRef<const gmx::RVec> x,
-                                 const t_mdatoms*               mdatoms,
-                                 gmx::ArrayRef<const real>      lambda,
-                                 const StepWorkload&            stepWork,
-                                 gmx::ForceWithVirial*          forceWithVirialMtsLevel0,
-                                 gmx::ForceWithVirial*          forceWithVirialMtsLevel1,
-                                 gmx_enerdata_t*                enerd,
-                                 gmx_edsam*                     ed,
-                                 bool                           didNeighborSearch)
-{
-    /* NOTE: Currently all ForceProviders only provide forces.
-     *       When they also provide energies, remove this conditional.
-     */
-    if (stepWork.computeForces)
-    {
-        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
-        gmx::ForceProviderOutput forceProviderOutput(forceWithVirialMtsLevel0, enerd);
-
-        /* Collect forces from modules */
-        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
-    }
-
-    if (inputrec->bPull && pull_have_potential(*pull_work))
-    {
-        const int mtsLevel = forceGroupMtsLevel(inputrec->mtsLevels, gmx::MtsForceGroups::Pull);
-        if (mtsLevel == 0 || stepWork.computeSlowForces)
-        {
-            auto& forceWithVirial = (mtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
-            pull_potential_wrapper(cr, inputrec, box, x, forceWithVirial, mdatoms, enerd, pull_work,
-                                   lambda.data(), t, wcycle);
-        }
-    }
-    if (awh)
-    {
-        const int mtsLevel = forceGroupMtsLevel(inputrec->mtsLevels, gmx::MtsForceGroups::Pull);
-        if (mtsLevel == 0 || stepWork.computeSlowForces)
-        {
-            const bool needForeignEnergyDifferences = awh->needForeignEnergyDifferences(step);
-            std::vector<double> foreignLambdaDeltaH, foreignLambdaDhDl;
-            if (needForeignEnergyDifferences)
-            {
-                enerd->foreignLambdaTerms.finalizePotentialContributions(enerd->dvdl_lin, lambda,
-                                                                         *inputrec->fepvals);
-                std::tie(foreignLambdaDeltaH, foreignLambdaDhDl) = enerd->foreignLambdaTerms.getTerms(cr);
-            }
-
-            auto& forceWithVirial = (mtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
-            enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
-                    inputrec->pbcType, mdatoms->massT, foreignLambdaDeltaH, foreignLambdaDhDl, box,
-                    forceWithVirial, t, step, wcycle, fplog);
-        }
-    }
-
-    rvec* f = as_rvec_array(forceWithVirialMtsLevel0->force_.data());
-
-    /* Add the forces from enforced rotation potentials (if any) */
-    if (inputrec->bRot)
-    {
-        wallcycle_start(wcycle, ewcROTadd);
-        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
-        wallcycle_stop(wcycle, ewcROTadd);
-    }
-
-    if (ed)
-    {
-        /* Note that since init_edsam() is called after the initialization
-         * of forcerec, edsam doesn't request the noVirSum force buffer.
-         * Thus if no other algorithm (e.g. PME) requires it, the forces
-         * here will contribute to the virial.
-         */
-        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, didNeighborSearch);
-    }
-
-    /* Add forces from interactive molecular dynamics (IMD), if any */
-    if (inputrec->bIMD && stepWork.computeForces)
-    {
-        imdSession->applyForces(f);
-    }
-}
-
-/*! \brief Launch the prepare_step and spread stages of PME GPU.
- *
- * \param[in]  pmedata              The PME structure
- * \param[in]  box                  The box matrix
- * \param[in]  stepWork             Step schedule flags
- * \param[in]  xReadyOnDevice       Event synchronizer indicating that the coordinates are ready in the device memory.
- * \param[in]  lambdaQ              The Coulomb lambda of the current state.
- * \param[in]  wcycle               The wallcycle structure
- */
-static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
-                                      const matrix          box,
-                                      const StepWorkload&   stepWork,
-                                      GpuEventSynchronizer* xReadyOnDevice,
-                                      const real            lambdaQ,
-                                      gmx_wallcycle_t       wcycle)
-{
-    pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
-    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
-}
-
-/*! \brief Launch the FFT and gather stages of PME GPU
- *
- * This function only implements setting the output forces (no accumulation).
- *
- * \param[in]  pmedata        The PME structure
- * \param[in]  lambdaQ        The Coulomb lambda of the current system state.
- * \param[in]  wcycle         The wallcycle structure
- * \param[in]  stepWork       Step schedule flags
- */
-static void launchPmeGpuFftAndGather(gmx_pme_t*               pmedata,
-                                     const real               lambdaQ,
-                                     gmx_wallcycle_t          wcycle,
-                                     const gmx::StepWorkload& stepWork)
-{
-    pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
-    pme_gpu_launch_gather(pmedata, wcycle, lambdaQ);
-}
-
-/*! \brief
- *  Polling wait for either of the PME or nonbonded GPU tasks.
- *
- * Instead of a static order in waiting for GPU tasks, this function
- * polls checking which of the two tasks completes first, and does the
- * associated force buffer reduction overlapped with the other task.
- * By doing that, unlike static scheduling order, it can always overlap
- * one of the reductions, regardless of the GPU task completion order.
- *
- * \param[in]     nbv              Nonbonded verlet structure
- * \param[in,out] pmedata          PME module data
- * \param[in,out] forceOutputsNonbonded  Force outputs for the non-bonded forces and shift forces
- * \param[in,out] forceOutputsPme  Force outputs for the PME forces and virial
- * \param[in,out] enerd            Energy data structure results are reduced into
- * \param[in]     lambdaQ          The Coulomb lambda of the current system state.
- * \param[in]     stepWork         Step schedule flags
- * \param[in]     wcycle           The wallcycle structure
- */
-static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
-                                        gmx_pme_t*          pmedata,
-                                        gmx::ForceOutputs*  forceOutputsNonbonded,
-                                        gmx::ForceOutputs*  forceOutputsPme,
-                                        gmx_enerdata_t*     enerd,
-                                        const real          lambdaQ,
-                                        const StepWorkload& stepWork,
-                                        gmx_wallcycle_t     wcycle)
-{
-    bool isPmeGpuDone = false;
-    bool isNbGpuDone  = false;
-
-    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
-
-    while (!isPmeGpuDone || !isNbGpuDone)
-    {
-        if (!isPmeGpuDone)
-        {
-            GpuTaskCompletion completionType =
-                    (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle,
-                                                   &forceOutputsPme->forceWithVirial(), enerd,
-                                                   lambdaQ, completionType);
-        }
-
-        if (!isNbGpuDone)
-        {
-            auto&             forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
-            GpuTaskCompletion completionType =
-                    (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            isNbGpuDone = Nbnxm::gpu_try_finish_task(
-                    nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
-                    enerd->grpp.ener[egCOULSR].data(), forceBuffersNonbonded.shiftForces(),
-                    completionType, wcycle);
-
-            if (isNbGpuDone)
-            {
-                nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
-            }
-        }
-    }
-}
-
-/*! \brief Set up the different force buffers; also does clearing.
- *
- * \param[in] forceHelperBuffers  Helper force buffers
- * \param[in] force     force array
- * \param[in] stepWork  Step schedule flags
- * \param[out] wcycle   wallcycle recording structure
- *
- * \returns             Cleared force output structure
- */
-static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceHelperBuffers,
-                                      gmx::ArrayRefWithPadding<gmx::RVec> force,
-                                      const StepWorkload&                 stepWork,
-                                      gmx_wallcycle_t                     wcycle)
-{
-    wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);
-
-    /* NOTE: We assume fr->shiftForces is all zeros here */
-    gmx::ForceWithShiftForces forceWithShiftForces(force, stepWork.computeVirial,
-                                                   forceHelperBuffers->shiftForces());
-
-    if (stepWork.computeForces)
-    {
-        /* Clear the short- and long-range forces */
-        clearRVecs(forceWithShiftForces.force(), true);
-
-        /* Clear the shift forces */
-        clearRVecs(forceWithShiftForces.shiftForces(), false);
-    }
-
-    /* If we need to compute the virial, we might need a separate
-     * force buffer for algorithms for which the virial is calculated
-     * directly, such as PME. Otherwise, forceWithVirial uses the
-     * the same force (f in legacy calls) buffer as other algorithms.
-     */
-    const bool useSeparateForceWithVirialBuffer =
-            (stepWork.computeForces
-             && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
-    /* forceWithVirial uses the local atom range only */
-    gmx::ForceWithVirial forceWithVirial(
-            useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
-                                             : force.unpaddedArrayRef(),
-            stepWork.computeVirial);
-
-    if (useSeparateForceWithVirialBuffer)
-    {
-        /* TODO: update comment
-         * We only compute forces on local atoms. Note that vsites can
-         * spread to non-local atoms, but that part of the buffer is
-         * cleared separately in the vsite spreading code.
-         */
-        clearRVecs(forceWithVirial.force_, true);
-    }
-
-    wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER);
-
-    return ForceOutputs(forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(),
-                        forceWithVirial);
-}
-
-
-/*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
- */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
-                                                          const t_forcerec&         fr,
-                                                          const pull_t*             pull_work,
-                                                          const gmx_edsam*          ed,
-                                                          const t_mdatoms&          mdatoms,
-                                                          const SimulationWorkload& simulationWork,
-                                                          const StepWorkload&       stepWork)
-{
-    DomainLifetimeWorkload domainWork;
-    // Note that haveSpecialForces is constant over the whole run
-    domainWork.haveSpecialForces =
-            haveSpecialForces(inputrec, *fr.forceProviders, pull_work, stepWork.computeForces, ed);
-    domainWork.haveCpuListedForceWork = false;
-    domainWork.haveCpuBondedWork      = false;
-    for (const auto& listedForces : fr.listedForces)
-    {
-        if (listedForces.haveCpuListedForces(*fr.fcdata))
-        {
-            domainWork.haveCpuListedForceWork = true;
-        }
-        if (listedForces.haveCpuBondeds())
-        {
-            domainWork.haveCpuBondedWork = true;
-        }
-    }
-    domainWork.haveGpuBondedWork = ((fr.gpuBonded != nullptr) && fr.gpuBonded->haveInteractions());
-    // Note that haveFreeEnergyWork is constant over the whole run
-    domainWork.haveFreeEnergyWork = (fr.efep != efepNO && mdatoms.nPerturbed != 0);
-    // We assume we have local force work if there are CPU
-    // force tasks including PME or nonbondeds.
-    domainWork.haveCpuLocalForceWork =
-            domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork
-            || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
-            || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
-
-    return domainWork;
-}
-
-/*! \brief Set up force flag stuct from the force bitmask.
- *
- * \param[in]      legacyFlags          Force bitmask flags used to construct the new flags
- * \param[in]      mtsLevels            The multiple time-stepping levels, either empty or 2 levels
- * \param[in]      step                 The current MD step
- * \param[in]      simulationWork       Simulation workload description.
- * \param[in]      rankHasPmeDuty       If this rank computes PME.
- *
- * \returns New Stepworkload description.
- */
-static StepWorkload setupStepWorkload(const int                     legacyFlags,
-                                      ArrayRef<const gmx::MtsLevel> mtsLevels,
-                                      const int64_t                 step,
-                                      const SimulationWorkload&     simulationWork,
-                                      const bool                    rankHasPmeDuty)
-{
-    GMX_ASSERT(mtsLevels.empty() || mtsLevels.size() == 2, "Expect 0 or 2 MTS levels");
-    const bool computeSlowForces = (mtsLevels.empty() || step % mtsLevels[1].stepFactor == 0);
-
-    StepWorkload flags;
-    flags.stateChanged        = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
-    flags.haveDynamicBox      = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
-    flags.doNeighborSearch    = ((legacyFlags & GMX_FORCE_NS) != 0);
-    flags.computeSlowForces   = computeSlowForces;
-    flags.computeVirial       = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
-    flags.computeEnergy       = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
-    flags.computeForces       = ((legacyFlags & GMX_FORCE_FORCES) != 0);
-    flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
-    flags.computeNonbondedForces =
-            ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded
-            && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
-    flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
-
-    if (simulationWork.useGpuBufferOps)
-    {
-        GMX_ASSERT(simulationWork.useGpuNonbonded,
-                   "Can only offload buffer ops if nonbonded computation is also offloaded");
-    }
-    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
-    // on virial steps the CPU reduction path is taken
-    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
-    flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
-                                && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
-
-    return flags;
-}
-
-
-/* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
- *
- * TODO: eliminate \p useGpuPmeOnThisRank when this is
- * incorporated in DomainLifetimeWorkload.
- */
-static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
-                                    gmx::GpuBonded*                   gpuBonded,
-                                    gmx_pme_t*                        pmedata,
-                                    gmx_enerdata_t*                   enerd,
-                                    const gmx::MdrunScheduleWorkload& runScheduleWork,
-                                    bool                              useGpuPmeOnThisRank,
-                                    int64_t                           step,
-                                    gmx_wallcycle_t                   wcycle)
-{
-    if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
-    {
-        /* Launch pruning before buffer clearing because the API overhead of the
-         * clear kernel launches can leave the GPU idle while it could be running
-         * the prune kernel.
-         */
-        if (nbv->isDynamicPruningStepGpu(step))
-        {
-            nbv->dispatchPruneKernelGpu(step);
-        }
-
-        /* now clear the GPU outputs while we finish the step on the CPU */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    if (useGpuPmeOnThisRank)
-    {
-        pme_gpu_reinit_computation(pmedata, wcycle);
-    }
-
-    if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
-    {
-        // in principle this should be included in the DD balancing region,
-        // but generally it is infrequent so we'll omit it for the sake of
-        // simpler code
-        gpuBonded->waitAccumulateEnergyTerms(enerd);
-
-        gpuBonded->clearEnergies();
-    }
-}
-
-//! \brief Data structure to hold dipole-related data and staging arrays
-struct DipoleData
-{
-    //! Dipole staging for fast summing over MPI
-    gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
-    //! Dipole staging for states A and B (index 0 and 1 resp.)
-    gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
-};
-
-
-static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
-                                 const t_commrec*              cr,
-                                 const bool                    haveFreeEnergy,
-                                 gmx::ArrayRef<const real>     lambda,
-                                 rvec                          muTotal,
-                                 const DDBalanceRegionHandler& ddBalanceRegionHandler)
-{
-    if (PAR(cr))
-    {
-        gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
-        ddBalanceRegionHandler.reopenRegionCpu();
-    }
-    for (int i = 0; i < 2; i++)
-    {
-        for (int j = 0; j < DIM; j++)
-        {
-            dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
-        }
-    }
-
-    if (!haveFreeEnergy)
-    {
-        copy_rvec(dipoleData->muStateAB[0], muTotal);
-    }
-    else
-    {
-        for (int j = 0; j < DIM; j++)
-        {
-            muTotal[j] = (1.0 - lambda[efptCOUL]) * dipoleData->muStateAB[0][j]
-                         + lambda[efptCOUL] * dipoleData->muStateAB[1][j];
-        }
-    }
-}
-
-/*! \brief Combines MTS level0 and level1 force buffes into a full and MTS-combined force buffer.
- *
- * \param[in]     numAtoms        The number of atoms to combine forces for
- * \param[in,out] forceMtsLevel0  Input: F_level0, output: F_level0 + F_level1
- * \param[in,out] forceMts        Input: F_level1, output: F_level0 + mtsFactor * F_level1
- * \param[in]     mtsFactor       The factor between the level0 and level1 time step
- */
-static void combineMtsForces(const int      numAtoms,
-                             ArrayRef<RVec> forceMtsLevel0,
-                             ArrayRef<RVec> forceMts,
-                             const real     mtsFactor)
-{
-    const int gmx_unused numThreads = gmx_omp_nthreads_get(emntDefault);
-#pragma omp parallel for num_threads(numThreads) schedule(static)
-    for (int i = 0; i < numAtoms; i++)
-    {
-        const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
-        forceMtsLevel0[i] += forceMts[i];
-        forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
-    }
-}
-
-/*! \brief Setup for the local and non-local GPU force reductions:
- * reinitialization plus the registration of forces and dependencies.
- *
- * \param [in] runScheduleWork               Schedule workload flag structure
- * \param [in] cr                            Communication record object
- * \param [in] fr                            Force record object
- */
-static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
-                                    const t_commrec*            cr,
-                                    t_forcerec*                 fr)
-{
-
-    nonbonded_verlet_t*          nbv      = fr->nbv.get();
-    gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
-
-    // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
-    const int atomStart = 0;
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
-            stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::Local), nbv->getGridIndices(),
-            atomStart, accumulate, stateGpu->fReducedOnDevice());
-
-    // register forces and add dependencies
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces());
-
-    if (runScheduleWork->simulationWork.useGpuPme
-        && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
-    {
-        void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                                       : // PME force buffer on same GPU
-                                 fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
-
-        GpuEventSynchronizer* const pmeSynchronizer =
-                (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
-                                               : // PME force buffer on same GPU
-                         fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
-    }
-
-    if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
-        && !runScheduleWork->simulationWork.useGpuHaloExchange)
-    {
-        auto forcesReadyLocality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
-        const bool useGpuForceBufferOps = true;
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                stateGpu->getForcesReadyOnDeviceEvent(forcesReadyLocality, useGpuForceBufferOps));
-    }
-
-    if (runScheduleWork->simulationWork.useGpuHaloExchange)
-    {
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
-    }
-
-    if (havePPDomainDecomposition(cr))
-    {
-        // (re-)initialize non-local GPU force reduction
-        const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
-                                || runScheduleWork->domainWork.haveFreeEnergyWork;
-        const int atomStart = dd_numHomeAtoms(*cr->dd);
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
-                stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::NonLocal),
-                nbv->getGridIndices(), atomStart, accumulate);
-
-        // register forces and add dependencies
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces());
-        if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork)
-        {
-            fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
-                    stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true));
-        }
-    }
-}
-
-
-void do_force(FILE*                               fplog,
-              const t_commrec*                    cr,
-              const gmx_multisim_t*               ms,
-              const t_inputrec*                   inputrec,
-              gmx::Awh*                           awh,
-              gmx_enfrot*                         enforcedRotation,
-              gmx::ImdSession*                    imdSession,
-              pull_t*                             pull_work,
-              int64_t                             step,
-              t_nrnb*                             nrnb,
-              gmx_wallcycle_t                     wcycle,
-              const gmx_localtop_t*               top,
-              const matrix                        box,
-              gmx::ArrayRefWithPadding<gmx::RVec> x,
-              history_t*                          hist,
-              gmx::ForceBuffersView*              forceView,
-              tensor                              vir_force,
-              const t_mdatoms*                    mdatoms,
-              gmx_enerdata_t*                     enerd,
-              gmx::ArrayRef<const real>           lambda,
-              t_forcerec*                         fr,
-              gmx::MdrunScheduleWorkload*         runScheduleWork,
-              gmx::VirtualSitesHandler*           vsite,
-              rvec                                muTotal,
-              double                              t,
-              gmx_edsam*                          ed,
-              int                                 legacyFlags,
-              const DDBalanceRegionHandler&       ddBalanceRegionHandler)
-{
-    auto force = forceView->forceWithPadding();
-    GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
-               "The size of the force buffer should be at least the number of atoms to compute "
-               "forces for");
-
-    nonbonded_verlet_t*  nbv = fr->nbv.get();
-    interaction_const_t* ic  = fr->ic;
-
-    gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
-
-    const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
-
-    runScheduleWork->stepWork    = setupStepWorkload(legacyFlags, inputrec->mtsLevels, step,
-                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
-    const StepWorkload& stepWork = runScheduleWork->stepWork;
-
-    const bool useGpuPmeOnThisRank =
-            simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces;
-
-    /* At a search step we need to start the first balancing region
-     * somewhere early inside the step after communication during domain
-     * decomposition (and not during the previous step as usual).
-     */
-    if (stepWork.doNeighborSearch)
-    {
-        ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
-    }
-
-    clear_mat(vir_force);
-
-    if (fr->pbcType != PbcType::No)
-    {
-        /* Compute shift vectors every step,
-         * because of pressure coupling or box deformation!
-         */
-        if (stepWork.haveDynamicBox && stepWork.stateChanged)
-        {
-            calc_shifts(box, fr->shift_vec);
-        }
-
-        const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
-        const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
-        if (calcCGCM)
-        {
-            put_atoms_in_box_omp(fr->pbcType, box, x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
-                                 gmx_omp_nthreads_get(emntDefault));
-            inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
-        }
-    }
-
-    nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
-
-    const bool pmeSendCoordinatesFromGpu =
-            GMX_MPI && simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
-    const bool reinitGpuPmePpComms =
-            GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
-
-    const auto localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
-                                             ? stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                       AtomLocality::Local, simulationWork, stepWork)
-                                             : nullptr;
-
-    // Copy coordinate from the GPU if update is on the GPU and there
-    // are forces to be computed on the CPU, or for the computation of
-    // virial, or if host-side data will be transferred from this task
-    // to a remote task for halo exchange or PME-PP communication. At
-    // search steps the current coordinates are already on the host,
-    // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
-    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
-    {
-        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        haveCopiedXFromGpu = true;
-    }
-
-    // If coordinates are to be sent to PME task from CPU memory, perform that send here.
-    // Otherwise the send will occur after H2D coordinate transfer.
-    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu && stepWork.computeSlowForces)
-    {
-        /* Send particle coordinates to the pme nodes */
-        if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
-        {
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-
-        gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
-                                 lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
-                                 pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
-    }
-
-    // Coordinates on the device are needed if PME or BufferOps are offloaded.
-    // The local coordinates can be copied right away.
-    // NOTE: Consider moving this copy to right after they are updated and constrained,
-    //       if the later is not offloaded.
-    if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
-    {
-        if (stepWork.doNeighborSearch)
-        {
-            // TODO refactor this to do_md, after partitioning.
-            stateGpu->reinit(mdatoms->homenr,
-                             cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
-            if (useGpuPmeOnThisRank)
-            {
-                // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
-                pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
-            }
-        }
-        // We need to copy coordinates when:
-        // 1. Update is not offloaded
-        // 2. The buffers were reinitialized on search step
-        if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
-        {
-            GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
-            stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        }
-    }
-
-    // If coordinates are to be sent to PME task from GPU memory, perform that send here.
-    // Otherwise the send will occur before the H2D coordinate transfer.
-    if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
-    {
-        /* Send particle coordinates to the pme nodes */
-        gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
-                                 lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
-                                 pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
-    }
-
-    if (useGpuPmeOnThisRank)
-    {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle);
-    }
-
-    const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
-
-    /* do gridding for pair search */
-    if (stepWork.doNeighborSearch)
-    {
-        if (fr->wholeMoleculeTransform && stepWork.stateChanged)
-        {
-            fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
-        }
-
-        // TODO
-        // - vzero is constant, do we need to pass it?
-        // - box_diag should be passed directly to nbnxn_put_on_grid
-        //
-        rvec vzero;
-        clear_rvec(vzero);
-
-        rvec box_diag;
-        box_diag[XX] = box[XX][XX];
-        box_diag[YY] = box[YY][YY];
-        box_diag[ZZ] = box[ZZ][ZZ];
-
-        wallcycle_start(wcycle, ewcNS);
-        if (!DOMAINDECOMP(cr))
-        {
-            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
-            nbnxn_put_on_grid(nbv, box, 0, vzero, box_diag, nullptr, { 0, mdatoms->homenr }, -1,
-                              fr->cginfo, x.unpaddedArrayRef(), 0, nullptr);
-            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
-        }
-        else
-        {
-            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
-            nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef());
-            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
-        }
-
-        nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
-                               gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr), fr->cginfo);
-
-        wallcycle_stop(wcycle, ewcNS);
-
-        /* initialize the GPU nbnxm atom data and bonded data structures */
-        if (simulationWork.useGpuNonbonded)
-        {
-            // Note: cycle counting only nononbondeds, gpuBonded counts internally
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-
-            if (fr->gpuBonded)
-            {
-                /* Now we put all atoms on the grid, we can assign bonded
-                 * interactions to the GPU, where the grid order is
-                 * needed. Also the xq, f and fshift device buffers have
-                 * been reallocated if needed, so the bonded code can
-                 * learn about them. */
-                // TODO the xq, f, and fshift buffers are now shared
-                // resources, so they should be maintained by a
-                // higher-level object than the nb module.
-                fr->gpuBonded->updateInteractionListsAndDeviceBuffers(
-                        nbv->getGridIndices(), top->idef, Nbnxm::gpu_get_xq(nbv->gpu_nbv),
-                        Nbnxm::gpu_get_f(nbv->gpu_nbv), Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
-            }
-        }
-
-        // Need to run after the GPU-offload bonded interaction lists
-        // are set up to be able to determine whether there is bonded work.
-        runScheduleWork->domainWork = setupDomainLifetimeWorkload(
-                *inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
-
-        wallcycle_start_nocount(wcycle, ewcNS);
-        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
-        /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
-        nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
-
-        nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
-
-        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
-        wallcycle_stop(wcycle, ewcNS);
-
-        if (stepWork.useGpuXBufferOps)
-        {
-            nbv->atomdata_init_copy_x_to_nbat_x_gpu();
-        }
-
-        if (simulationWork.useGpuBufferOps)
-        {
-            setupGpuForceReductions(runScheduleWork, cr, fr);
-        }
-    }
-    else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces)
-    {
-        if (stepWork.useGpuXBufferOps)
-        {
-            GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
-            nbv->convertCoordinatesGpu(AtomLocality::Local, false, stateGpu->getCoordinates(),
-                                       localXReadyOnDevice);
-        }
-        else
-        {
-            if (simulationWork.useGpuUpdate)
-            {
-                GMX_ASSERT(stateGpu, "need a valid stateGpu object");
-                GMX_ASSERT(haveCopiedXFromGpu,
-                           "a wait should only be triggered if copy has been scheduled");
-                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-            }
-            nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
-        }
-    }
-
-    if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
-    {
-        ddBalanceRegionHandler.openBeforeForceComputationGpu();
-
-        wallcycle_start(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
-        if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
-        {
-            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
-        }
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-        // with X buffer ops offloaded to the GPU on all but the search steps
-
-        // bonded work not split into separate local and non-local, so with DD
-        // we can only launch the kernel after non-local coordinates have been received.
-        if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
-        {
-            fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
-        }
-
-        /* launch local nonbonded work on GPU */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    if (useGpuPmeOnThisRank)
-    {
-        // In PME GPU and mixed mode we launch FFT / gather after the
-        // X copy/transform to allow overlap as well as after the GPU NB
-        // launch to avoid FFT launch overhead hijacking the CPU and delaying
-        // the nonbonded kernel.
-        launchPmeGpuFftAndGather(fr->pmedata, lambda[efptCOUL], wcycle, stepWork);
-    }
-
-    /* Communicate coordinates and sum dipole if necessary +
-       do non-local pair search */
-    if (havePPDomainDecomposition(cr))
-    {
-        if (stepWork.doNeighborSearch)
-        {
-            // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
-            wallcycle_start_nocount(wcycle, ewcNS);
-            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
-            /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
-            nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
-
-            nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
-            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
-            wallcycle_stop(wcycle, ewcNS);
-            // TODO refactor this GPU halo exchange re-initialisation
-            // to location in do_md where GPU halo exchange is
-            // constructed at partitioning, after above stateGpu
-            // re-initialization has similarly been refactored
-            if (simulationWork.useGpuHaloExchange)
-            {
-                reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
-            }
-        }
-        else
-        {
-            if (stepWork.useGpuXHalo)
-            {
-                // The following must be called after local setCoordinates (which records an event
-                // when the coordinate data has been copied to the device).
-                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
-
-                if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
-                {
-                    // non-local part of coordinate buffer must be copied back to host for CPU work
-                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
-                }
-            }
-            else
-            {
-                if (simulationWork.useGpuUpdate)
-                {
-                    GMX_ASSERT(haveCopiedXFromGpu,
-                               "a wait should only be triggered if copy has been scheduled");
-                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-                }
-                dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
-            }
-
-            if (stepWork.useGpuXBufferOps)
-            {
-                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
-                {
-                    stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
-                }
-                nbv->convertCoordinatesGpu(AtomLocality::NonLocal, false, stateGpu->getCoordinates(),
-                                           stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                   AtomLocality::NonLocal, simulationWork, stepWork));
-            }
-            else
-            {
-                nbv->convertCoordinates(AtomLocality::NonLocal, false, x.unpaddedArrayRef());
-            }
-        }
-
-        if (simulationWork.useGpuNonbonded)
-        {
-
-            if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
-            {
-                wallcycle_start(wcycle, ewcLAUNCH_GPU);
-                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-                Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
-                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-                wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-            }
-
-            if (domainWork.haveGpuBondedWork)
-            {
-                fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
-            }
-
-            /* launch non-local nonbonded tasks on GPU */
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step,
-                         nrnb, wcycle);
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-        }
-    }
-
-    if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
-    {
-        /* launch D2H copy-back F */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
-        if (havePPDomainDecomposition(cr))
-        {
-            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
-        }
-        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
-        if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
-        {
-            fr->gpuBonded->launchEnergyTransfer();
-        }
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
-    if (fr->wholeMoleculeTransform)
-    {
-        xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
-    }
-
-    DipoleData dipoleData;
-
-    if (simulationWork.computeMuTot)
-    {
-        const int start = 0;
-
-        if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
-        {
-            GMX_ASSERT(haveCopiedXFromGpu,
-                       "a wait should only be triggered if copy has been scheduled");
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-
-        /* Calculate total (local) dipole moment in a temporary common array.
-         * This makes it possible to sum them over nodes faster.
-         */
-        gmx::ArrayRef<const gmx::RVec> xRef =
-                (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
-        calc_mu(start, mdatoms->homenr, xRef, mdatoms->chargeA, mdatoms->chargeB,
-                mdatoms->nChargePerturbed, dipoleData.muStaging[0], dipoleData.muStaging[1]);
-
-        reduceAndUpdateMuTot(&dipoleData, cr, (fr->efep != efepNO), lambda, muTotal, ddBalanceRegionHandler);
-    }
-
-    /* Reset energies */
-    reset_enerdata(enerd);
-
-    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
-    {
-        wallcycle_start(wcycle, ewcPPDURINGPME);
-        dd_force_flop_start(cr->dd, nrnb);
-    }
-
-    // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
-    // this wait ensures that the D2H transfer is complete.
-    if ((simulationWork.useGpuUpdate)
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
-    {
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-    }
-
-    if (inputrec->bRot)
-    {
-        wallcycle_start(wcycle, ewcROT);
-        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step,
-                    stepWork.doNeighborSearch);
-        wallcycle_stop(wcycle, ewcROT);
-    }
-
-    /* Start the force cycle counter.
-     * Note that a different counter is used for dynamic load balancing.
-     */
-    wallcycle_start(wcycle, ewcFORCE);
-
-    /* Set up and clear force outputs:
-     * forceOutMtsLevel0:  everything except what is in the other two outputs
-     * forceOutMtsLevel1:  PME-mesh and listed-forces group 1
-     * forceOutNonbonded: non-bonded forces
-     * Without multiple time stepping all point to the same object.
-     * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
-     */
-    ForceOutputs forceOutMtsLevel0 =
-            setupForceOutputs(&fr->forceHelperBuffers[0], force, stepWork, wcycle);
-
-    // Force output for MTS combined forces, only set at level1 MTS steps
-    std::optional<ForceOutputs> forceOutMts =
-            (fr->useMts && stepWork.computeSlowForces)
-                    ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
-                                                      forceView->forceMtsCombinedWithPadding(),
-                                                      stepWork, wcycle))
-                    : std::nullopt;
-
-    ForceOutputs* forceOutMtsLevel1 =
-            fr->useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr) : &forceOutMtsLevel0;
-
-    const bool nonbondedAtMtsLevel1 = runScheduleWork->simulationWork.computeNonbondedAtMtsLevel1;
-
-    ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
-
-    if (inputrec->bPull && pull_have_constraint(*pull_work))
-    {
-        clear_pull_forces(pull_work);
-    }
-
-    /* We calculate the non-bonded forces, when done on the CPU, here.
-     * We do this before calling do_force_lowlevel, because in that
-     * function, the listed forces are calculated before PME, which
-     * does communication.  With this order, non-bonded and listed
-     * force calculation imbalance can be balanced out by the domain
-     * decomposition load balancing.
-     */
-
-    const bool useOrEmulateGpuNb = simulationWork.useGpuNonbonded || fr->nbv->emulateGpu();
-
-    if (!useOrEmulateGpuNb)
-    {
-        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
-    }
-
-    if (fr->efep != efepNO && stepWork.computeNonbondedForces)
-    {
-        /* Calculate the local and non-local free energy interactions here.
-         * Happens here on the CPU both with and without GPU.
-         */
-        nbv->dispatchFreeEnergyKernel(InteractionLocality::Local, fr,
-                                      as_rvec_array(x.unpaddedArrayRef().data()),
-                                      &forceOutNonbonded->forceWithShiftForces(), *mdatoms,
-                                      inputrec->fepvals, lambda, enerd, stepWork, nrnb);
-
-        if (havePPDomainDecomposition(cr))
-        {
-            nbv->dispatchFreeEnergyKernel(InteractionLocality::NonLocal, fr,
-                                          as_rvec_array(x.unpaddedArrayRef().data()),
-                                          &forceOutNonbonded->forceWithShiftForces(), *mdatoms,
-                                          inputrec->fepvals, lambda, enerd, stepWork, nrnb);
-        }
-    }
-
-    if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
-    {
-        if (havePPDomainDecomposition(cr))
-        {
-            do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step,
-                         nrnb, wcycle);
-        }
-
-        if (stepWork.computeForces)
-        {
-            /* Add all the non-bonded force to the normal force array.
-             * This can be split into a local and a non-local part when overlapping
-             * communication with calculation with domain decomposition.
-             */
-            wallcycle_stop(wcycle, ewcFORCE);
-            nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
-                                          forceOutNonbonded->forceWithShiftForces().force());
-            wallcycle_start_nocount(wcycle, ewcFORCE);
-        }
-
-        /* If there are multiple fshift output buffers we need to reduce them */
-        if (stepWork.computeVirial)
-        {
-            /* This is not in a subcounter because it takes a
-               negligible and constant-sized amount of time */
-            nbnxn_atomdata_add_nbat_fshift_to_fshift(
-                    *nbv->nbat, forceOutNonbonded->forceWithShiftForces().shiftForces());
-        }
-    }
-
-    // TODO Force flags should include haveFreeEnergyWork for this domain
-    if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
-    {
-        wallcycle_stop(wcycle, ewcFORCE);
-        /* Wait for non-local coordinate data to be copied from device */
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
-        wallcycle_start_nocount(wcycle, ewcFORCE);
-    }
-
-    // Compute wall interactions, when present.
-    // Note: should be moved to special forces.
-    if (inputrec->nwall && stepWork.computeNonbondedForces)
-    {
-        /* foreign lambda component for walls */
-        real dvdl_walls = do_walls(*inputrec, *fr, box, *mdatoms, x.unpaddedConstArrayRef(),
-                                   &forceOutMtsLevel0.forceWithVirial(), lambda[efptVDW],
-                                   enerd->grpp.ener[egLJSR].data(), nrnb);
-        enerd->dvdl_lin[efptVDW] += dvdl_walls;
-    }
-
-    if (stepWork.computeListedForces)
-    {
-        /* Check whether we need to take into account PBC in listed interactions */
-        bool needMolPbc = false;
-        for (const auto& listedForces : fr->listedForces)
-        {
-            if (listedForces.haveCpuListedForces(*fr->fcdata))
-            {
-                needMolPbc = fr->bMolPBC;
-            }
-        }
-
-        t_pbc pbc;
-
-        if (needMolPbc)
-        {
-            /* Since all atoms are in the rectangular or triclinic unit-cell,
-             * only single box vector shifts (2 in x) are required.
-             */
-            set_pbc_dd(&pbc, fr->pbcType, DOMAINDECOMP(cr) ? cr->dd->numCells : nullptr, TRUE, box);
-        }
-
-        for (int mtsIndex = 0; mtsIndex < (fr->useMts && stepWork.computeSlowForces ? 2 : 1); mtsIndex++)
-        {
-            ListedForces& listedForces = fr->listedForces[mtsIndex];
-            ForceOutputs& forceOut     = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
-            listedForces.calculate(
-                    wcycle, box, inputrec->fepvals, cr, ms, x, xWholeMolecules, fr->fcdata.get(),
-                    hist, &forceOut, fr, &pbc, enerd, nrnb, lambda.data(), mdatoms,
-                    DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr, stepWork);
-        }
-    }
-
-    if (stepWork.computeSlowForces)
-    {
-        calculateLongRangeNonbondeds(fr, inputrec, cr, nrnb, wcycle, mdatoms,
-                                     x.unpaddedConstArrayRef(), &forceOutMtsLevel1->forceWithVirial(),
-                                     enerd, box, lambda.data(), as_rvec_array(dipoleData.muStateAB),
-                                     stepWork, ddBalanceRegionHandler);
-    }
-
-    wallcycle_stop(wcycle, ewcFORCE);
-
-    // VdW dispersion correction, only computed on master rank to avoid double counting
-    if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
-    {
-        // Calculate long range corrections to pressure and energy
-        const DispersionCorrection::Correction correction =
-                fr->dispersionCorrection->calculate(box, lambda[efptVDW]);
-
-        if (stepWork.computeEnergy)
-        {
-            enerd->term[F_DISPCORR] = correction.energy;
-            enerd->term[F_DVDL_VDW] += correction.dvdl;
-            enerd->dvdl_lin[efptVDW] += correction.dvdl;
-        }
-        if (stepWork.computeVirial)
-        {
-            correction.correctVirial(vir_force);
-            enerd->term[F_PDISPCORR] = correction.pressure;
-        }
-    }
-
-    const bool needToReceivePmeResultsFromSeparateRank =
-            (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces);
-
-    /* When running free energy perturbations steered by AWH and doing PME calculations on the
-     * GPU we must wait for the PME calculation (dhdl) results to finish before sampling the
-     * FEP dimension with AWH. */
-    const bool needEarlyPmeResults = (awh != nullptr && awh->hasFepLambdaDimension()
-                                      && pme_run_mode(fr->pmedata) != PmeRunMode::None
-                                      && stepWork.computeEnergy && stepWork.computeSlowForces);
-    if (needEarlyPmeResults)
-    {
-        if (useGpuPmeOnThisRank)
-        {
-            pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle,
-                                    &forceOutMtsLevel1->forceWithVirial(), enerd, lambda[efptCOUL]);
-        }
-        else if (needToReceivePmeResultsFromSeparateRank)
-        {
-            /* In case of node-splitting, the PP nodes receive the long-range
-             * forces, virial and energy from the PME nodes here.
-             */
-            pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
-                                   simulationWork.useGpuPmePpCommunication,
-                                   stepWork.useGpuPmeFReduction, wcycle);
-        }
-    }
-
-    computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation, imdSession, pull_work, step, t,
-                         wcycle, fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
-                         stepWork, &forceOutMtsLevel0.forceWithVirial(),
-                         forceOutMtsLevel1 ? &forceOutMtsLevel1->forceWithVirial() : nullptr, enerd,
-                         ed, stepWork.doNeighborSearch);
-
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
-               "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
-               "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
-    // Will store the amount of cycles spent waiting for the GPU that
-    // will be later used in the DLB accounting.
-    float cycles_wait_gpu = 0;
-    if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
-    {
-        auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
-
-        /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
-        {
-            if (simulationWork.useGpuNonbonded)
-            {
-                cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
-                        nbv->gpu_nbv, stepWork, AtomLocality::NonLocal, enerd->grpp.ener[egLJSR].data(),
-                        enerd->grpp.ener[egCOULSR].data(), forceWithShiftForces.shiftForces(), wcycle);
-            }
-            else
-            {
-                wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes,
-                             step, nrnb, wcycle);
-                wallcycle_stop(wcycle, ewcFORCE);
-            }
-
-            if (stepWork.useGpuFBufferOps)
-            {
-                // TODO: move this into DomainLifetimeWorkload, including the second part of the
-                // condition The bonded and free energy CPU tasks can have non-local force
-                // contributions which are a dependency for the GPU force reduction.
-                bool haveNonLocalForceContribInCpuBuffer =
-                        domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
-
-                if (haveNonLocalForceContribInCpuBuffer)
-                {
-                    stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
-                                              AtomLocality::NonLocal);
-                }
-
-
-                fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
-
-                if (!stepWork.useGpuFHalo)
-                {
-                    // copy from GPU input for dd_move_f()
-                    stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
-                                                AtomLocality::NonLocal);
-                }
-            }
-            else
-            {
-                nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
-            }
-
-            if (fr->nbv->emulateGpu() && stepWork.computeVirial)
-            {
-                nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, forceWithShiftForces.shiftForces());
-            }
-        }
-    }
-
-    /* Combining the forces for multiple time stepping before the halo exchange, when possible,
-     * avoids an extra halo exchange (when DD is used) and post-processing step.
-     */
-    const bool combineMtsForcesBeforeHaloExchange =
-            (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
-             && (legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0
-             && !(stepWork.computeVirial || simulationWork.useGpuNonbonded || useGpuPmeOnThisRank));
-    if (combineMtsForcesBeforeHaloExchange)
-    {
-        const int numAtoms = havePPDomainDecomposition(cr) ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr;
-        combineMtsForces(numAtoms, force.unpaddedArrayRef(), forceView->forceMtsCombined(),
-                         inputrec->mtsLevels[1].stepFactor);
-    }
-
-    if (havePPDomainDecomposition(cr))
-    {
-        /* We are done with the CPU compute.
-         * We will now communicate the non-local forces.
-         * If we use a GPU this will overlap with GPU work, so in that case
-         * we do not close the DD force balancing region here.
-         */
-        ddBalanceRegionHandler.closeAfterForceComputationCpu();
-
-        if (stepWork.computeForces)
-        {
-
-            if (stepWork.useGpuFHalo)
-            {
-                if (domainWork.haveCpuLocalForceWork)
-                {
-                    stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
-                                              AtomLocality::Local);
-                }
-                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
-            }
-            else
-            {
-                if (stepWork.useGpuFBufferOps)
-                {
-                    stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
-                }
-
-                // Without MTS or with MTS at slow steps with uncombined forces we need to
-                // communicate the fast forces
-                if (!fr->useMts || !combineMtsForcesBeforeHaloExchange)
-                {
-                    dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
-                }
-                // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
-                if (fr->useMts && stepWork.computeSlowForces)
-                {
-                    dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
-                }
-            }
-        }
-    }
-
-    // With both nonbonded and PME offloaded a GPU on the same rank, we use
-    // an alternating wait/reduction scheme.
-    // When running free energy perturbations steered by AWH and calculating PME on GPU,
-    // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
-    bool alternateGpuWait =
-            (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
-             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps && !needEarlyPmeResults);
-    if (alternateGpuWait)
-    {
-        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, forceOutNonbonded,
-                                    forceOutMtsLevel1, enerd, lambda[efptCOUL], stepWork, wcycle);
-    }
-
-    if (!alternateGpuWait && useGpuPmeOnThisRank && !needEarlyPmeResults)
-    {
-        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle,
-                                &forceOutMtsLevel1->forceWithVirial(), enerd, lambda[efptCOUL]);
-    }
-
-    /* Wait for local GPU NB outputs on the non-alternating wait path */
-    if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
-    {
-        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
-         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
-         * but even with a step of 0.1 ms the difference is less than 1%
-         * of the step time.
-         */
-        const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
-        const float waitCycles               = Nbnxm::gpu_wait_finish_task(
-                nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
-                enerd->grpp.ener[egCOULSR].data(),
-                forceOutNonbonded->forceWithShiftForces().shiftForces(), wcycle);
-
-        if (ddBalanceRegionHandler.useBalancingRegion())
-        {
-            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
-            if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
-            {
-                /* We measured few cycles, it could be that the kernel
-                 * and transfer finished earlier and there was no actual
-                 * wait time, only API call overhead.
-                 * Then the actual time could be anywhere between 0 and
-                 * cycles_wait_est. We will use half of cycles_wait_est.
-                 */
-                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
-            }
-            ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
-        }
-    }
-
-    if (fr->nbv->emulateGpu())
-    {
-        // NOTE: emulation kernel is not included in the balancing region,
-        // but emulation mode does not target performance anyway
-        wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local,
-                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes, step, nrnb, wcycle);
-        wallcycle_stop(wcycle, ewcFORCE);
-    }
-
-    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
-    // TODO refactor this and unify with below default-path call to the same function
-    // When running free energy perturbations steered by AWH and calculating PME on GPU,
-    // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
-    if (needToReceivePmeResultsFromSeparateRank && simulationWork.useGpuPmePpCommunication && !needEarlyPmeResults)
-    {
-        /* In case of node-splitting, the PP nodes receive the long-range
-         * forces, virial and energy from the PME nodes here.
-         */
-        pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePpCommunication,
-                               stepWork.useGpuPmeFReduction, wcycle);
-    }
-
-
-    /* Do the nonbonded GPU (or emulation) force buffer reduction
-     * on the non-alternating path. */
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
-               "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    if (useOrEmulateGpuNb && !alternateGpuWait)
-    {
-        if (stepWork.useGpuFBufferOps)
-        {
-            ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
-
-            // Flag to specify whether the CPU force buffer has contributions to
-            // local atoms. This depends on whether there are CPU-based force tasks
-            // or when DD is active the halo exchange has resulted in contributions
-            // from the non-local part.
-            const bool haveLocalForceContribInCpuBuffer =
-                    (domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr));
-
-            // TODO: move these steps as early as possible:
-            // - CPU f H2D should be as soon as all CPU-side forces are done
-            // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
-            //   before the next CPU task that consumes the forces: vsite spread or update)
-            // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
-            //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
-            //   These should be unified.
-            if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
-            {
-                // Note: AtomLocality::All is used for the non-DD case because, as in this
-                // case copyForcesToGpu() uses a separate stream, it allows overlap of
-                // CPU force H2D with GPU force tasks on all streams including those in the
-                // local stream which would otherwise be implicit dependencies for the
-                // transfer and would not overlap.
-                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
-
-                stateGpu->copyForcesToGpu(forceWithShift, locality);
-            }
-
-            if (stepWork.computeNonbondedForces)
-            {
-                fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
-            }
-
-            // Copy forces to host if they are needed for update or if virtual sites are enabled.
-            // If there are vsites, we need to copy forces every step to spread vsite forces on host.
-            // TODO: When the output flags will be included in step workload, this copy can be combined with the
-            //       copy call done in sim_utils(...) for the output.
-            // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
-            //       they should not be copied in do_md(...) for the output.
-            if (!simulationWork.useGpuUpdate
-                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
-            {
-                stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
-                stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
-            }
-        }
-        else if (stepWork.computeNonbondedForces)
-        {
-            ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
-            nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
-        }
-    }
-
-    launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
-                            useGpuPmeOnThisRank, step, wcycle);
-
-    if (DOMAINDECOMP(cr))
-    {
-        dd_force_flop_stop(cr->dd, nrnb);
-    }
-
-    const bool haveCombinedMtsForces = (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
-                                        && combineMtsForcesBeforeHaloExchange);
-    if (stepWork.computeForces)
-    {
-        postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0,
-                                        vir_force, *mdatoms, *fr, vsite, stepWork);
-
-        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
-        {
-            postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1,
-                                            vir_force, *mdatoms, *fr, vsite, stepWork);
-        }
-    }
-
-    // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
-    // When running free energy perturbations steered by AWH and calculating PME on GPU,
-    // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
-    if (needToReceivePmeResultsFromSeparateRank && !simulationWork.useGpuPmePpCommunication
-        && !needEarlyPmeResults)
-    {
-        /* In case of node-splitting, the PP nodes receive the long-range
-         * forces, virial and energy from the PME nodes here.
-         */
-        pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePpCommunication, false, wcycle);
-    }
-
-    if (stepWork.computeForces)
-    {
-        /* If we don't use MTS or if we already combined the MTS forces before, we only
-         * need to post-process one ForceOutputs object here, called forceOutCombined,
-         * otherwise we have to post-process two outputs and then combine them.
-         */
-        ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
-        postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined,
-                          vir_force, mdatoms, fr, vsite, stepWork);
-
-        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
-        {
-            postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1,
-                              vir_force, mdatoms, fr, vsite, stepWork);
-
-            combineMtsForces(mdatoms->homenr, force.unpaddedArrayRef(),
-                             forceView->forceMtsCombined(), inputrec->mtsLevels[1].stepFactor);
-        }
-    }
-
-    if (stepWork.computeEnergy)
-    {
-        /* Compute the final potential energy terms */
-        accumulatePotentialEnergies(enerd, lambda, inputrec->fepvals);
-
-        if (!EI_TPI(inputrec->eI))
-        {
-            checkPotentialEnergyValidity(step, *enerd, *inputrec);
-        }
-    }
-
-    /* In case we don't have constraints and are using GPUs, the next balancing
-     * region starts here.
-     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
-     * virial calculation and COM pulling, is not thus not included in
-     * the balance timing, which is ok as most tasks do communication.
-     */
-    ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.cpp
deleted file mode 100644
index 8706772915..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief This file declares helper functionality for legacy option handling for mdrun
- *
- * \author Berk Hess <hess@kth.se>
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Erik Lindahl <erik@kth.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- *
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "legacymdrunoptions.h"
-
-#include <cstring>
-
-#include "gromacs/math/functions.h"
-#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/arraysize.h"
-#include "gromacs/utility/fatalerror.h"
-
-namespace gmx
-{
-
-/*! \brief Return whether the command-line parameter that
- *  will trigger a multi-simulation is set */
-static bool is_multisim_option_set(int argc, const char* const argv[])
-{
-    for (int i = 0; i < argc; ++i)
-    {
-        if (strcmp(argv[i], "-multidir") == 0)
-        {
-            return true;
-        }
-    }
-    return false;
-}
-
-int LegacyMdrunOptions::updateFromCommandLine(int argc, char** argv, ArrayRef<const char*> desc)
-{
-    unsigned long PCA_Flags = PCA_CAN_SET_DEFFNM;
-    // With -multidir, the working directory still needs to be
-    // changed, so we can't check for the existence of files during
-    // parsing.  It isn't useful to do any completion based on file
-    // system contents, either.
-    if (is_multisim_option_set(argc, argv))
-    {
-        PCA_Flags |= PCA_DISABLE_INPUT_FILE_CHECKING;
-    }
-
-    if (!parse_common_args(&argc, argv, PCA_Flags, ssize(filenames), filenames.data(), asize(pa),
-                           pa, ssize(desc), desc.data(), 0, nullptr, &oenv))
-    {
-        return 0;
-    }
-
-    // Handle the options that permits the user to either declare
-    // which compatible GPUs are availble for use, or to select a GPU
-    // task assignment. Either could be in an environment variable (so
-    // that there is a way to customize it, when using MPI in
-    // heterogeneous contexts).
-    {
-        // TODO Argument parsing can't handle std::string. We should
-        // fix that by changing the parsing, once more of the roles of
-        // handling, validating and implementing defaults for user
-        // command-line options have been seperated.
-        hw_opt.gpuIdsAvailable       = gpuIdsAvailable;
-        hw_opt.userGpuTaskAssignment = userGpuTaskAssignment;
-
-        const char* env = getenv("GMX_GPU_ID");
-        if (env != nullptr)
-        {
-            if (!hw_opt.gpuIdsAvailable.empty())
-            {
-                gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
-            }
-            hw_opt.gpuIdsAvailable = env;
-        }
-
-        env = getenv("GMX_GPUTASKS");
-        if (env != nullptr)
-        {
-            if (!hw_opt.userGpuTaskAssignment.empty())
-            {
-                gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time");
-            }
-            hw_opt.userGpuTaskAssignment = env;
-        }
-
-        if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty())
-        {
-            gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time");
-        }
-    }
-
-    hw_opt.threadAffinity = static_cast<ThreadAffinity>(nenum(thread_aff_opt_choices));
-
-    if (!opt2parg_bSet("-append", asize(pa), pa))
-    {
-        mdrunOptions.appendingBehavior = AppendingBehavior::Auto;
-    }
-    else
-    {
-        if (opt2parg_bool("-append", asize(pa), pa))
-        {
-            mdrunOptions.appendingBehavior = AppendingBehavior::Appending;
-        }
-        else
-        {
-            mdrunOptions.appendingBehavior = AppendingBehavior::NoAppending;
-        }
-    }
-
-    mdrunOptions.rerun            = opt2bSet("-rerun", ssize(filenames), filenames.data());
-    mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa);
-
-    domdecOptions.rankOrder    = static_cast<DdRankOrder>(nenum(ddrank_opt_choices));
-    domdecOptions.dlbOption    = static_cast<DlbOption>(nenum(dddlb_opt_choices));
-    domdecOptions.numCells[XX] = roundToInt(realddxyz[XX]);
-    domdecOptions.numCells[YY] = roundToInt(realddxyz[YY]);
-    domdecOptions.numCells[ZZ] = roundToInt(realddxyz[ZZ]);
-
-    /* PLUMED */
-    plumedswitch=0;
-    if (opt2bSet("-plumed", static_cast<int>(filenames.size()), filenames.data())) plumedswitch=1;
-    if(plumedswitch){
-      int real_precision=sizeof(real);
-      real energyUnits=1.0;
-      real lengthUnits=1.0;
-      real timeUnits=1.0;
-  
-      if(!plumed_installed()){
-        gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable.");
-      }
-      plumedmain=plumed_create();
-      plumed_cmd(plumedmain,"setRealPrecision",&real_precision);
-      // this is not necessary for gromacs units:
-      plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits);
-      plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits);
-      plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits);
-      //
-      plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,static_cast<int>(filenames.size()), filenames.data()));
-      plumedswitch=1;
-    }
-    /* PLUMED HREX*/
-    if(getenv("PLUMED_HREX")) plumed_hrex=1;
-    if(plumed_hrex){
-      if(!plumedswitch) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) requires -plumed");
-      if(replExParams.exchangeInterval==0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) replica exchange");
-      if(replExParams.numExchanges!=0) gmx_fatal(FARGS,"-hrex (or PLUMED_HREX) not compatible with -nex");
-    }
-    /* END PLUMED HREX */
-
-    /* END PLUMED */
-
-    return 1;
-}
-
-LegacyMdrunOptions::~LegacyMdrunOptions()
-{
-    output_env_done(oenv);
-}
-
-} // namespace gmx
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed
deleted file mode 100644
index 42d0a7df38..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief This file declares helper functionality for legacy option handling for mdrun
- *
- * \author Berk Hess <hess@kth.se>
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Erik Lindahl <erik@kth.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- *
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "legacymdrunoptions.h"
-
-#include <cstring>
-
-#include "gromacs/math/functions.h"
-#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/arraysize.h"
-#include "gromacs/utility/fatalerror.h"
-
-namespace gmx
-{
-
-/*! \brief Report whether \c -multidir, the option that requests a
- *  multi-simulation, is one of the first \p argc entries of \p argv */
-static bool is_multisim_option_set(int argc, const char* const argv[])
-{
-    bool multidirRequested = false;
-    int  argIndex          = 0;
-    while (!multidirRequested && argIndex < argc)
-    {
-        multidirRequested = (strcmp(argv[argIndex], "-multidir") == 0);
-        ++argIndex;
-    }
-    return multidirRequested;
-}
-
-// Fills the option storage of this object from the command line, then derives
-// the hardware, appending, rerun and domain-decomposition settings from it.
-// Returns 0 if parse_common_args() returns false, and 1 otherwise.
-int LegacyMdrunOptions::updateFromCommandLine(int argc, char** argv, ArrayRef<const char*> desc)
-{
-    // With -multidir, the working directory still needs to be changed,
-    // so neither checking for the existence of files during parsing nor
-    // completion based on file system contents is useful.
-    const bool    multiSimRequested = is_multisim_option_set(argc, argv);
-    unsigned long parseFlags        = PCA_CAN_SET_DEFFNM;
-    if (multiSimRequested)
-    {
-        parseFlags |= PCA_DISABLE_INPUT_FILE_CHECKING;
-    }
-
-    const bool parsedOk =
-            parse_common_args(&argc, argv, parseFlags, ssize(filenames), filenames.data(),
-                              asize(pa), pa, ssize(desc), desc.data(), 0, nullptr, &oenv);
-    if (!parsedOk)
-    {
-        return 0;
-    }
-
-    // The user can either declare which compatible GPUs are available for
-    // use, or select a GPU task assignment. Either could also come from an
-    // environment variable, so that there is a way to customize it when
-    // using MPI in heterogeneous contexts.
-    //
-    // TODO Argument parsing can't handle std::string. We should fix that
-    // by changing the parsing, once more of the roles of handling,
-    // validating and implementing defaults for user command-line options
-    // have been separated.
-    hw_opt.gpuIdsAvailable       = gpuIdsAvailable;
-    hw_opt.userGpuTaskAssignment = userGpuTaskAssignment;
-
-    if (const char* gpuIdEnv = getenv("GMX_GPU_ID"))
-    {
-        if (!hw_opt.gpuIdsAvailable.empty())
-        {
-            gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
-        }
-        hw_opt.gpuIdsAvailable = gpuIdEnv;
-    }
-
-    if (const char* gpuTasksEnv = getenv("GMX_GPUTASKS"))
-    {
-        if (!hw_opt.userGpuTaskAssignment.empty())
-        {
-            gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time");
-        }
-        hw_opt.userGpuTaskAssignment = gpuTasksEnv;
-    }
-
-    const bool haveGpuIds   = !hw_opt.gpuIdsAvailable.empty();
-    const bool haveGpuTasks = !hw_opt.userGpuTaskAssignment.empty();
-    if (haveGpuIds && haveGpuTasks)
-    {
-        gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time");
-    }
-
-    hw_opt.threadAffinity = static_cast<ThreadAffinity>(nenum(thread_aff_opt_choices));
-
-    // If -append was given on the command line its value decides between
-    // Appending and NoAppending; otherwise the behavior is Auto.
-    if (opt2parg_bSet("-append", asize(pa), pa))
-    {
-        mdrunOptions.appendingBehavior = opt2parg_bool("-append", asize(pa), pa)
-                                                 ? AppendingBehavior::Appending
-                                                 : AppendingBehavior::NoAppending;
-    }
-    else
-    {
-        mdrunOptions.appendingBehavior = AppendingBehavior::Auto;
-    }
-
-    mdrunOptions.rerun            = opt2bSet("-rerun", ssize(filenames), filenames.data());
-    mdrunOptions.ntompOptionIsSet = opt2parg_bSet("-ntomp", asize(pa), pa);
-
-    domdecOptions.rankOrder    = static_cast<DdRankOrder>(nenum(ddrank_opt_choices));
-    domdecOptions.dlbOption    = static_cast<DlbOption>(nenum(dddlb_opt_choices));
-    domdecOptions.numCells[XX] = roundToInt(realddxyz[XX]);
-    domdecOptions.numCells[YY] = roundToInt(realddxyz[YY]);
-    domdecOptions.numCells[ZZ] = roundToInt(realddxyz[ZZ]);
-
-    return 1;
-}
-
-LegacyMdrunOptions::~LegacyMdrunOptions()
-{
-    output_env_done(oenv); // oenv is filled by parse_common_args(); presumably released here - confirm
-}
-
-} // namespace gmx
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.h b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.h
deleted file mode 100644
index 13ee9b89f9..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.h
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- *
- * \brief This file declares helper functionality for legacy option handling for mdrun
- *
- * \author Berk Hess <hess@kth.se>
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Erik Lindahl <erik@kth.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- *
- * \ingroup module_mdrun
- * \inlibraryapi
- */
-#ifndef GMX_MDRUN_LEGACYMDRUNOPTIONS_H
-#define GMX_MDRUN_LEGACYMDRUNOPTIONS_H
-
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/commandline/pargs.h"
-#include "gromacs/domdec/options.h"
-#include "gromacs/hardware/hw_info.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-
-#include "replicaexchange.h"
-
-/* PLUMED: globals declared here and defined elsewhere (extern) */
-#include "../../../Plumed.h"
-extern int    plumedswitch; // set to 1 when -plumed is given, 0 otherwise
-extern plumed plumedmain; // handle assigned from plumed_create() when -plumed is given
-/* END PLUMED */
-
-/* PLUMED HREX: nonzero when -hrex or the PLUMED_HREX environment variable is set */
-extern int plumed_hrex;
-/* END PLUMED HREX */
-
-namespace gmx
-
-{
-
-/*! \libinternal
- * \brief This class provides the same command-line option
- * functionality to both CLI and API sessions.
- *
- * This class should not exist, but is necessary now to introduce
- * support for the CLI and API without duplicating code. It should be
- * eliminated following the TODOs below.
- *
- * \warning Instances provide lifetime scope for members that do not have
- *  effective lifetime management or which are frequently accessed unsafely.
- *  The caller is responsible for keeping a LegacyMdrunOptions object alive
- *  for as long as any consumers, direct or transitive.
- *
- * \todo Modules in mdrun should acquire proper option handling so
- *       that all of these declarations and defaults are local to the
- *       modules.
- *
- * \todo Contextual aspects, such as working directory
- *       and environment variable handling are more properly
- *       the role of SimulationContext, and should be moved there.
- */
-class LegacyMdrunOptions
-{
-public:
-    //! Ongoing collection of mdrun options
-    MdrunOptions mdrunOptions;
-    //! Options for the domain decomposition.
-    DomdecOptions domdecOptions;
-    //! Parallelism-related user options.
-    gmx_hw_opt_t hw_opt;
-    //! Command-line override for the duration of a neighbor list with the Verlet scheme.
-    int nstlist_cmdline = 0;
-    //! Parameters for replica-exchange simulations.
-    ReplicaExchangeParameters replExParams;
-
-    //! Filename options to fill from command-line argument values.
-    std::vector<t_filenm> filenames = { { { efTPR, nullptr, nullptr, ffREAD },
-                                          { efTRN, "-o", nullptr, ffWRITE },
-                                          { efCOMPRESSED, "-x", nullptr, ffOPTWR },
-                                          { efCPT, "-cpi", nullptr, ffOPTRD | ffALLOW_MISSING },
-                                          { efCPT, "-cpo", nullptr, ffOPTWR },
-                                          { efSTO, "-c", "confout", ffWRITE },
-                                          { efEDR, "-e", "ener", ffWRITE },
-                                          { efLOG, "-g", "md", ffWRITE },
-                                          { efXVG, "-dhdl", "dhdl", ffOPTWR },
-                                          { efXVG, "-field", "field", ffOPTWR },
-                                          { efXVG, "-table", "table", ffOPTRD },
-                                          { efXVG, "-tablep", "tablep", ffOPTRD },
-                                          { efXVG, "-tableb", "table", ffOPTRDMULT },
-                                          { efTRX, "-rerun", "rerun", ffOPTRD },
-                                          { efXVG, "-tpi", "tpi", ffOPTWR },
-                                          { efXVG, "-tpid", "tpidist", ffOPTWR },
-                                          { efEDI, "-ei", "sam", ffOPTRD },
-                                          { efXVG, "-eo", "edsam", ffOPTWR },
-                                          { efXVG, "-px", "pullx", ffOPTWR },
-                                          { efXVG, "-pf", "pullf", ffOPTWR },
-                                          { efXVG, "-ro", "rotation", ffOPTWR },
-                                          { efLOG, "-ra", "rotangles", ffOPTWR },
-                                          { efLOG, "-rs", "rotslabs", ffOPTWR },
-                                          { efLOG, "-rt", "rottorque", ffOPTWR },
-                                          { efMTX, "-mtx", "nm", ffOPTWR },
-                                          { efRND, "-multidir", nullptr, ffOPTRDMULT },
-                                          { efXVG, "-awh", "awhinit", ffOPTRD },
-                                          { efDAT, "-plumed", "plumed", ffOPTRD },  /* PLUMED: optional PLUMED input file */
-                                          { efDAT, "-membed", "membed", ffOPTRD },
-                                          { efTOP, "-mp", "membed", ffOPTRD },
-                                          { efNDX, "-mn", "membed", ffOPTRD },
-                                          { efXVG, "-if", "imdforces", ffOPTWR },
-                                          { efXVG, "-swap", "swapions", ffOPTWR } } };
-
-    //! Print a warning if any force is larger than this (in kJ/mol nm).
-    real pforce = -1;
-
-    //! The value of the -append option
-    bool appendOption = true;
-
-    /*! \brief Output context for writing text files
-     *
-     * \todo Clarify initialization, ownership, and lifetime. */
-    gmx_output_env_t* oenv = nullptr;
-
-    /*! \brief Command line options, defaults, docs and storage for them to fill. */
-    /*! \{ */
-    rvec        realddxyz                                                    = { 0, 0, 0 };
-    const char* ddrank_opt_choices[static_cast<int>(DdRankOrder::Count) + 1] = {
-        nullptr, "interleave", "pp_pme", "cartesian", nullptr
-    };
-    const char* dddlb_opt_choices[static_cast<int>(DlbOption::Count) + 1] = { nullptr, "auto", "no",
-                                                                              "yes", nullptr };
-    const char* thread_aff_opt_choices[static_cast<int>(ThreadAffinity::Count) + 1] = {
-        nullptr, "auto", "on", "off", nullptr
-    };
-    const char* nbpu_opt_choices[5]    = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* pme_opt_choices[5]     = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* pme_fft_opt_choices[5] = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* bonded_opt_choices[5]  = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* update_opt_choices[5]  = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* gpuIdsAvailable        = ""; //!< Storage for the -gpu_id option
-    const char* userGpuTaskAssignment  = ""; //!< Storage for the -gputasks option
-
-    //! Alias for mdrunOptions.imdOptions; the -imd* entries of pa point into it.
-    ImdOptions& imdOptions = mdrunOptions.imdOptions;
-    //! Non-file options; each entry points at the member that receives its value.
-        t_pargs           pa[49] = {
-
-        { "-dd", FALSE, etRVEC, { &realddxyz }, "Domain decomposition grid, 0 is optimize" },
-        { "-ddorder", FALSE, etENUM, { ddrank_opt_choices }, "DD rank order" },
-        { "-npme",
-          FALSE,
-          etINT,
-          { &domdecOptions.numPmeRanks },
-          "Number of separate ranks to be used for PME, -1 is guess" },
-        { "-nt",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_tot },
-          "Total number of threads to start (0 is guess)" },
-        { "-ntmpi",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_tmpi },
-          "Number of thread-MPI ranks to start (0 is guess)" },
-        { "-ntomp",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_omp },
-          "Number of OpenMP threads per MPI rank to start (0 is guess)" },
-        { "-ntomp_pme",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_omp_pme },
-          "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
-        { "-pin",
-          FALSE,
-          etENUM,
-          { thread_aff_opt_choices },
-          "Whether mdrun should try to set thread affinities" },
-        { "-pinoffset",
-          FALSE,
-          etINT,
-          { &hw_opt.core_pinning_offset },
-          "The lowest logical core number to which mdrun should pin the first thread" },
-        { "-pinstride",
-          FALSE,
-          etINT,
-          { &hw_opt.core_pinning_stride },
-          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads "
-          "per physical core" },
-        { "-gpu_id",
-          FALSE,
-          etSTR,
-          { &gpuIdsAvailable },
-          "List of unique GPU device IDs available to use" },
-        { "-gputasks",
-          FALSE,
-          etSTR,
-          { &userGpuTaskAssignment },
-          "List of GPU device IDs, mapping each PP task on each node to a device" },
-        { "-ddcheck",
-          FALSE,
-          etBOOL,
-          { &domdecOptions.checkBondedInteractions },
-          "Check for all bonded interactions with DD" },
-        { "-ddbondcomm",
-          FALSE,
-          etBOOL,
-          { &domdecOptions.useBondedCommunication },
-          "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
-        { "-rdd",
-          FALSE,
-          etREAL,
-          { &domdecOptions.minimumCommunicationRange },
-          "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial "
-          "coordinates" },
-        { "-rcon",
-          FALSE,
-          etREAL,
-          { &domdecOptions.constraintCommunicationRange },
-          "Maximum distance for P-LINCS (nm), 0 is estimate" },
-        { "-dlb", FALSE, etENUM, { dddlb_opt_choices }, "Dynamic load balancing (with DD)" },
-        { "-dds",
-          FALSE,
-          etREAL,
-          { &domdecOptions.dlbScaling },
-          "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in "
-          "order to "
-          "provide a margin in which dynamic load balancing can act while preserving the minimum "
-          "cell size." },
-        { "-ddcsx",
-          FALSE,
-          etSTR,
-          { &domdecOptions.cellSizeX },
-          "HIDDENA string containing a vector of the relative sizes in the x "
-          "direction of the corresponding DD cells. Only effective with static "
-          "load balancing." },
-        { "-ddcsy",
-          FALSE,
-          etSTR,
-          { &domdecOptions.cellSizeY },
-          "HIDDENA string containing a vector of the relative sizes in the y "
-          "direction of the corresponding DD cells. Only effective with static "
-          "load balancing." },
-        { "-ddcsz",
-          FALSE,
-          etSTR,
-          { &domdecOptions.cellSizeZ },
-          "HIDDENA string containing a vector of the relative sizes in the z "
-          "direction of the corresponding DD cells. Only effective with static "
-          "load balancing." },
-        { "-nb", FALSE, etENUM, { nbpu_opt_choices }, "Calculate non-bonded interactions on" },
-        { "-nstlist",
-          FALSE,
-          etINT,
-          { &nstlist_cmdline },
-          "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
-        { "-tunepme",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.tunePme },
-          "Optimize PME load between PP/PME ranks or GPU/CPU" },
-        { "-pme", FALSE, etENUM, { pme_opt_choices }, "Perform PME calculations on" },
-        { "-pmefft", FALSE, etENUM, { pme_fft_opt_choices }, "Perform PME FFT calculations on" },
-        { "-bonded", FALSE, etENUM, { bonded_opt_choices }, "Perform bonded calculations on" },
-        { "-update", FALSE, etENUM, { update_opt_choices }, "Perform update and constraints on" },
-        { "-v", FALSE, etBOOL, { &mdrunOptions.verbose }, "Be loud and noisy" },
-        { "-pforce", FALSE, etREAL, { &pforce }, "Print all forces larger than this (kJ/mol nm)" },
-        { "-reprod",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.reproducible },
-          "Try to avoid optimizations that affect binary reproducibility" },
-        { "-cpt",
-          FALSE,
-          etREAL,
-          { &mdrunOptions.checkpointOptions.period },
-          "Checkpoint interval (minutes)" },
-        { "-cpnum",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles },
-          "Keep and number checkpoint files" },
-        { "-append",
-          FALSE,
-          etBOOL,
-          { &appendOption },
-          "Append to previous output files when continuing from checkpoint instead of adding the "
-          "simulation part number to all file names" },
-        { "-nsteps",
-          FALSE,
-          etINT64,
-          { &mdrunOptions.numStepsCommandline },
-          "Run this number of steps (-1 means infinite, -2 means use mdp option, smaller is "
-          "invalid)" },
-        { "-maxh",
-          FALSE,
-          etREAL,
-          { &mdrunOptions.maximumHoursToRun },
-          "Terminate after 0.99 times this time (hours)" },
-        { "-replex",
-          FALSE,
-          etINT,
-          { &replExParams.exchangeInterval },
-          "Attempt replica exchange periodically with this period (steps)" },
-        { "-nex",
-          FALSE,
-          etINT,
-          { &replExParams.numExchanges },
-          "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). "
-          " -nex zero or not specified gives neighbor replica exchange." },
-        { "-reseed",
-          FALSE,
-          etINT,
-          { &replExParams.randomSeed },
-          "Seed for replica exchange, -1 is generate a seed" },
-        { "-hrex",  FALSE, etBOOL, {&plumed_hrex}, /* PLUMED HREX */
-              "Enable hamiltonian replica exchange" }, // NOTE(review): plumed_hrex is declared int but registered as etBOOL - confirm
-        { "-imdport", FALSE, etINT, { &imdOptions.port }, "HIDDENIMD listening port" },
-        { "-imdwait",
-          FALSE,
-          etBOOL,
-          { &imdOptions.wait },
-          "HIDDENPause the simulation while no IMD client is connected" },
-        { "-imdterm",
-          FALSE,
-          etBOOL,
-          { &imdOptions.terminatable },
-          "HIDDENAllow termination of the simulation from IMD client" },
-        { "-imdpull",
-          FALSE,
-          etBOOL,
-          { &imdOptions.pull },
-          "HIDDENAllow pulling in the simulation from IMD client" },
-        { "-rerunvsite",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.rerunConstructVsites },
-          "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
-        { "-confout",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.writeConfout },
-          "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last "
-          "step" },
-        { "-stepout",
-          FALSE,
-          etINT,
-          { &mdrunOptions.verboseStepPrintInterval },
-          "HIDDENFrequency of writing the remaining wall clock time for the run" },
-        { "-resetstep",
-          FALSE,
-          etINT,
-          { &mdrunOptions.timingOptions.resetStep },
-          "HIDDENReset cycle counters after these many time steps" },
-        { "-resethway",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.timingOptions.resetHalfway },
-          "HIDDENReset the cycle counters after half the number of steps or halfway "
-          "[TT]-maxh[tt]" }
-    };
-    /*! \} */
-
-    //! Parses the command-line input and prepares to start mdrun.
-    int updateFromCommandLine(int argc, char** argv, ArrayRef<const char*> desc);
-    //! Passes oenv to output_env_done().
-    ~LegacyMdrunOptions();
-};
-
-} // end namespace gmx
-
-#endif
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed
deleted file mode 100644
index 474f6f0396..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed
+++ /dev/null
@@ -1,390 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- *
- * \brief This file declares helper functionality for legacy option handling for mdrun
- *
- * \author Berk Hess <hess@kth.se>
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Erik Lindahl <erik@kth.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- *
- * \ingroup module_mdrun
- * \inlibraryapi
- */
-#ifndef GMX_MDRUN_LEGACYMDRUNOPTIONS_H
-#define GMX_MDRUN_LEGACYMDRUNOPTIONS_H
-
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/commandline/pargs.h"
-#include "gromacs/domdec/options.h"
-#include "gromacs/hardware/hw_info.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-
-#include "replicaexchange.h"
-
-namespace gmx
-{
-
-/*! \libinternal
- * \brief This class provides the same command-line option
- * functionality to both CLI and API sessions.
- *
- * This class should not exist, but is necessary now to introduce
- * support for the CLI and API without duplicating code. It should be
- * eliminated following the TODOs below.
- *
- * \warning Instances provide lifetime scope for members that do not have
- *  effective lifetime management or which are frequently accessed unsafely.
- *  The caller is responsible for keeping a LegacyMdrunOptions object alive
- *  for as long as any consumers, direct or transitive.
- *
- * \todo Modules in mdrun should acquire proper option handling so
- *       that all of these declarations and defaults are local to the
- *       modules.
- *
- * \todo Contextual aspects, such as working directory
- *       and environment variable handling are more properly
- *       the role of SimulationContext, and should be moved there.
- */
-class LegacyMdrunOptions
-{
-public:
-    //! Ongoing collection of mdrun options
-    MdrunOptions mdrunOptions;
-    //! Options for the domain decomposition.
-    DomdecOptions domdecOptions;
-    //! Parallelism-related user options.
-    gmx_hw_opt_t hw_opt;
-    //! Command-line override for the duration of a neighbor list with the Verlet scheme.
-    int nstlist_cmdline = 0;
-    //! Parameters for replica-exchange simulations.
-    ReplicaExchangeParameters replExParams;
-
-    //! Filename options to fill from command-line argument values.
-    std::vector<t_filenm> filenames = { { { efTPR, nullptr, nullptr, ffREAD },
-                                          { efTRN, "-o", nullptr, ffWRITE },
-                                          { efCOMPRESSED, "-x", nullptr, ffOPTWR },
-                                          { efCPT, "-cpi", nullptr, ffOPTRD | ffALLOW_MISSING },
-                                          { efCPT, "-cpo", nullptr, ffOPTWR },
-                                          { efSTO, "-c", "confout", ffWRITE },
-                                          { efEDR, "-e", "ener", ffWRITE },
-                                          { efLOG, "-g", "md", ffWRITE },
-                                          { efXVG, "-dhdl", "dhdl", ffOPTWR },
-                                          { efXVG, "-field", "field", ffOPTWR },
-                                          { efXVG, "-table", "table", ffOPTRD },
-                                          { efXVG, "-tablep", "tablep", ffOPTRD },
-                                          { efXVG, "-tableb", "table", ffOPTRDMULT },
-                                          { efTRX, "-rerun", "rerun", ffOPTRD },
-                                          { efXVG, "-tpi", "tpi", ffOPTWR },
-                                          { efXVG, "-tpid", "tpidist", ffOPTWR },
-                                          { efEDI, "-ei", "sam", ffOPTRD },
-                                          { efXVG, "-eo", "edsam", ffOPTWR },
-                                          { efXVG, "-px", "pullx", ffOPTWR },
-                                          { efXVG, "-pf", "pullf", ffOPTWR },
-                                          { efXVG, "-ro", "rotation", ffOPTWR },
-                                          { efLOG, "-ra", "rotangles", ffOPTWR },
-                                          { efLOG, "-rs", "rotslabs", ffOPTWR },
-                                          { efLOG, "-rt", "rottorque", ffOPTWR },
-                                          { efMTX, "-mtx", "nm", ffOPTWR },
-                                          { efRND, "-multidir", nullptr, ffOPTRDMULT },
-                                          { efXVG, "-awh", "awhinit", ffOPTRD },
-                                          { efDAT, "-membed", "membed", ffOPTRD },
-                                          { efTOP, "-mp", "membed", ffOPTRD },
-                                          { efNDX, "-mn", "membed", ffOPTRD },
-                                          { efXVG, "-if", "imdforces", ffOPTWR },
-                                          { efXVG, "-swap", "swapions", ffOPTWR } } };
-
-    //! Print a warning if any force is larger than this (in kJ/mol nm).
-    real pforce = -1;
-
-    //! The value of the -append option
-    bool appendOption = true;
-
-    /*! \brief Output context for writing text files
-     *
-     * \todo Clarify initialization, ownership, and lifetime. */
-    gmx_output_env_t* oenv = nullptr;
-
-    /*! \brief Command line options, defaults, docs and storage for them to fill. */
-    /*! \{ */
-    rvec        realddxyz                                                    = { 0, 0, 0 };
-    const char* ddrank_opt_choices[static_cast<int>(DdRankOrder::Count) + 1] = {
-        nullptr, "interleave", "pp_pme", "cartesian", nullptr
-    };
-    const char* dddlb_opt_choices[static_cast<int>(DlbOption::Count) + 1] = { nullptr, "auto", "no",
-                                                                              "yes", nullptr };
-    const char* thread_aff_opt_choices[static_cast<int>(ThreadAffinity::Count) + 1] = {
-        nullptr, "auto", "on", "off", nullptr
-    };
-    const char* nbpu_opt_choices[5]    = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* pme_opt_choices[5]     = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* pme_fft_opt_choices[5] = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* bonded_opt_choices[5]  = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* update_opt_choices[5]  = { nullptr, "auto", "cpu", "gpu", nullptr };
-    const char* gpuIdsAvailable        = "";
-    const char* userGpuTaskAssignment  = "";
-
-
-    ImdOptions& imdOptions = mdrunOptions.imdOptions;
-
-    t_pargs pa[48] = {
-
-        { "-dd", FALSE, etRVEC, { &realddxyz }, "Domain decomposition grid, 0 is optimize" },
-        { "-ddorder", FALSE, etENUM, { ddrank_opt_choices }, "DD rank order" },
-        { "-npme",
-          FALSE,
-          etINT,
-          { &domdecOptions.numPmeRanks },
-          "Number of separate ranks to be used for PME, -1 is guess" },
-        { "-nt",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_tot },
-          "Total number of threads to start (0 is guess)" },
-        { "-ntmpi",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_tmpi },
-          "Number of thread-MPI ranks to start (0 is guess)" },
-        { "-ntomp",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_omp },
-          "Number of OpenMP threads per MPI rank to start (0 is guess)" },
-        { "-ntomp_pme",
-          FALSE,
-          etINT,
-          { &hw_opt.nthreads_omp_pme },
-          "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" },
-        { "-pin",
-          FALSE,
-          etENUM,
-          { thread_aff_opt_choices },
-          "Whether mdrun should try to set thread affinities" },
-        { "-pinoffset",
-          FALSE,
-          etINT,
-          { &hw_opt.core_pinning_offset },
-          "The lowest logical core number to which mdrun should pin the first thread" },
-        { "-pinstride",
-          FALSE,
-          etINT,
-          { &hw_opt.core_pinning_stride },
-          "Pinning distance in logical cores for threads, use 0 to minimize the number of threads "
-          "per physical core" },
-        { "-gpu_id",
-          FALSE,
-          etSTR,
-          { &gpuIdsAvailable },
-          "List of unique GPU device IDs available to use" },
-        { "-gputasks",
-          FALSE,
-          etSTR,
-          { &userGpuTaskAssignment },
-          "List of GPU device IDs, mapping each PP task on each node to a device" },
-        { "-ddcheck",
-          FALSE,
-          etBOOL,
-          { &domdecOptions.checkBondedInteractions },
-          "Check for all bonded interactions with DD" },
-        { "-ddbondcomm",
-          FALSE,
-          etBOOL,
-          { &domdecOptions.useBondedCommunication },
-          "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
-        { "-rdd",
-          FALSE,
-          etREAL,
-          { &domdecOptions.minimumCommunicationRange },
-          "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial "
-          "coordinates" },
-        { "-rcon",
-          FALSE,
-          etREAL,
-          { &domdecOptions.constraintCommunicationRange },
-          "Maximum distance for P-LINCS (nm), 0 is estimate" },
-        { "-dlb", FALSE, etENUM, { dddlb_opt_choices }, "Dynamic load balancing (with DD)" },
-        { "-dds",
-          FALSE,
-          etREAL,
-          { &domdecOptions.dlbScaling },
-          "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in "
-          "order to "
-          "provide a margin in which dynamic load balancing can act while preserving the minimum "
-          "cell size." },
-        { "-ddcsx",
-          FALSE,
-          etSTR,
-          { &domdecOptions.cellSizeX },
-          "HIDDENA string containing a vector of the relative sizes in the x "
-          "direction of the corresponding DD cells. Only effective with static "
-          "load balancing." },
-        { "-ddcsy",
-          FALSE,
-          etSTR,
-          { &domdecOptions.cellSizeY },
-          "HIDDENA string containing a vector of the relative sizes in the y "
-          "direction of the corresponding DD cells. Only effective with static "
-          "load balancing." },
-        { "-ddcsz",
-          FALSE,
-          etSTR,
-          { &domdecOptions.cellSizeZ },
-          "HIDDENA string containing a vector of the relative sizes in the z "
-          "direction of the corresponding DD cells. Only effective with static "
-          "load balancing." },
-        { "-nb", FALSE, etENUM, { nbpu_opt_choices }, "Calculate non-bonded interactions on" },
-        { "-nstlist",
-          FALSE,
-          etINT,
-          { &nstlist_cmdline },
-          "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
-        { "-tunepme",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.tunePme },
-          "Optimize PME load between PP/PME ranks or GPU/CPU" },
-        { "-pme", FALSE, etENUM, { pme_opt_choices }, "Perform PME calculations on" },
-        { "-pmefft", FALSE, etENUM, { pme_fft_opt_choices }, "Perform PME FFT calculations on" },
-        { "-bonded", FALSE, etENUM, { bonded_opt_choices }, "Perform bonded calculations on" },
-        { "-update", FALSE, etENUM, { update_opt_choices }, "Perform update and constraints on" },
-        { "-v", FALSE, etBOOL, { &mdrunOptions.verbose }, "Be loud and noisy" },
-        { "-pforce", FALSE, etREAL, { &pforce }, "Print all forces larger than this (kJ/mol nm)" },
-        { "-reprod",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.reproducible },
-          "Try to avoid optimizations that affect binary reproducibility" },
-        { "-cpt",
-          FALSE,
-          etREAL,
-          { &mdrunOptions.checkpointOptions.period },
-          "Checkpoint interval (minutes)" },
-        { "-cpnum",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles },
-          "Keep and number checkpoint files" },
-        { "-append",
-          FALSE,
-          etBOOL,
-          { &appendOption },
-          "Append to previous output files when continuing from checkpoint instead of adding the "
-          "simulation part number to all file names" },
-        { "-nsteps",
-          FALSE,
-          etINT64,
-          { &mdrunOptions.numStepsCommandline },
-          "Run this number of steps (-1 means infinite, -2 means use mdp option, smaller is "
-          "invalid)" },
-        { "-maxh",
-          FALSE,
-          etREAL,
-          { &mdrunOptions.maximumHoursToRun },
-          "Terminate after 0.99 times this time (hours)" },
-        { "-replex",
-          FALSE,
-          etINT,
-          { &replExParams.exchangeInterval },
-          "Attempt replica exchange periodically with this period (steps)" },
-        { "-nex",
-          FALSE,
-          etINT,
-          { &replExParams.numExchanges },
-          "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). "
-          " -nex zero or not specified gives neighbor replica exchange." },
-        { "-reseed",
-          FALSE,
-          etINT,
-          { &replExParams.randomSeed },
-          "Seed for replica exchange, -1 is generate a seed" },
-        { "-imdport", FALSE, etINT, { &imdOptions.port }, "HIDDENIMD listening port" },
-        { "-imdwait",
-          FALSE,
-          etBOOL,
-          { &imdOptions.wait },
-          "HIDDENPause the simulation while no IMD client is connected" },
-        { "-imdterm",
-          FALSE,
-          etBOOL,
-          { &imdOptions.terminatable },
-          "HIDDENAllow termination of the simulation from IMD client" },
-        { "-imdpull",
-          FALSE,
-          etBOOL,
-          { &imdOptions.pull },
-          "HIDDENAllow pulling in the simulation from IMD client" },
-        { "-rerunvsite",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.rerunConstructVsites },
-          "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
-        { "-confout",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.writeConfout },
-          "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last "
-          "step" },
-        { "-stepout",
-          FALSE,
-          etINT,
-          { &mdrunOptions.verboseStepPrintInterval },
-          "HIDDENFrequency of writing the remaining wall clock time for the run" },
-        { "-resetstep",
-          FALSE,
-          etINT,
-          { &mdrunOptions.timingOptions.resetStep },
-          "HIDDENReset cycle counters after these many time steps" },
-        { "-resethway",
-          FALSE,
-          etBOOL,
-          { &mdrunOptions.timingOptions.resetHalfway },
-          "HIDDENReset the cycle counters after half the number of steps or halfway "
-          "[TT]-maxh[tt]" }
-    };
-    /*! \} */
-
-    //! Parses the command-line input and prepares to start mdrun.
-    int updateFromCommandLine(int argc, char** argv, ArrayRef<const char*> desc);
-
-    ~LegacyMdrunOptions();
-};
-
-} // end namespace gmx
-
-#endif
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/md.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/md.cpp
deleted file mode 100644
index 5f27c78570..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/md.cpp
+++ /dev/null
@@ -1,1936 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements the integrator for normal molecular dynamics simulations
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-
-#include <algorithm>
-#include <memory>
-#include <numeric>
-
-#include "gromacs/applied_forces/awh/awh.h"
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/collect.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_network.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/gpuhaloexchange.h"
-#include "gromacs/domdec/mdsetup.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/ewald/pme_load_balancing.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/fileio/trxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/math/units.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/invertmatrix.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/math/vectypes.h"
-#include "gromacs/mdlib/checkpointhandler.h"
-#include "gromacs/mdlib/compute_io.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/coupling.h"
-#include "gromacs/mdlib/ebin.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/energyoutput.h"
-#include "gromacs/mdlib/expanded.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/freeenergyparameters.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/mdoutf.h"
-#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/resethandler.h"
-#include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/simulationsignal.h"
-#include "gromacs/mdlib/stat.h"
-#include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/trajectory_writing.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/update_constrain_gpu.h"
-#include "gromacs/mdlib/vcm.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdtypes/awh_history.h"
-#include "gromacs/mdtypes/awh_params.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/df_history.h"
-#include "gromacs/mdtypes/energyhistory.h"
-#include "gromacs/mdtypes/fcdata.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/multipletimestepping.h"
-#include "gromacs/mdtypes/observableshistory.h"
-#include "gromacs/mdtypes/pullhistory.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/modularsimulator/energydata.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/output.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/swap/swapcoords.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/atoms.h"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/trajectory/trajectoryframe.h"
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/real.h"
-#include "gromacs/utility/smalloc.h"
-
-#include "legacysimulator.h"
-#include "replicaexchange.h"
-#include "shellfc.h"
-
-/* PLUMED */
-#include "../../../Plumed.h"
-extern int    plumedswitch;
-extern plumed plumedmain;
-/* END PLUMED */
-
-/* PLUMED HREX */
-extern int plumed_hrex;
-/* END PLUMED HREX */
-
-using gmx::SimulationSignaller;
-
-void gmx::LegacySimulator::do_md()
-{
-    // TODO Historically, the EM and MD "integrators" used different
-    // names for the t_inputrec *parameter, but these must have the
-    // same name, now that it's a member of a struct. We use this ir
-    // alias to avoid a large ripple of nearly useless changes.
-    // t_inputrec is being replaced by IMdpOptionsProvider, so this
-    // will go away eventually.
-    t_inputrec*  ir = inputrec;
-    int64_t      step, step_rel;
-    double       t, t0 = ir->init_t;
-    gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
-    gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
-    gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
-    gmx_bool     do_ene, do_log, do_verbose;
-    gmx_bool     bMasterState;
-    unsigned int force_flags;
-    tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
-    int    i, m;
-    rvec   mu_tot;
-    matrix pressureCouplingMu, M;
-    gmx_repl_ex_t     repl_ex = nullptr;
-    gmx_global_stat_t gstat;
-    gmx_shellfc_t*    shellfc;
-    gmx_bool          bSumEkinhOld, bDoReplEx, bDoReplExPrev, bExchanged, bNeedRepartition;
-    gmx_bool          bTemp, bPres, bTrotter;
-    real              dvdl_constr;
-    std::vector<RVec> cbuf;
-    matrix            lastbox;
-    int               lamnew = 0;
-    /* for FEP */
-    int       nstfep = 0;
-    double    cycles;
-    real      saved_conserved_quantity = 0;
-    real      last_ekin                = 0;
-    t_extmass MassQ;
-    char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
-
-    /* PME load balancing data for GPU kernels */
-    gmx_bool bPMETune         = FALSE;
-    gmx_bool bPMETunePrinting = FALSE;
-
-    bool bInteractiveMDstep = false;
-
-    /* PLUMED */
-    int plumedNeedsEnergy=0;
-    int plumedWantsToStop=0;
-    matrix plumed_vir;
-    real lambdaForce=0;
-    real realFepState=0;
-    /* END PLUMED */
-
-    /* Domain decomposition could incorrectly miss a bonded
-       interaction, but checking for that requires a global
-       communication stage, which does not otherwise happen in DD
-       code. So we do that alongside the first global energy reduction
-       after a new DD is made. These variables handle whether the
-       check happens, and the result it returns. */
-    bool shouldCheckNumberOfBondedInteractions = false;
-    int  totalNumberOfBondedInteractions       = -1;
-
-    SimulationSignals signals;
-    // Most global communnication stages don't propagate mdrun
-    // signals, and will use this object to achieve that.
-    SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
-
-    if (!mdrunOptions.writeConfout)
-    {
-        // This is on by default, and the main known use case for
-        // turning it off is for convenience in benchmarking, which is
-        // something that should not show up in the general user
-        // interface.
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText(
-                        "The -noconfout functionality is deprecated, and may be removed in a "
-                        "future version.");
-    }
-
-    /* md-vv uses averaged full step velocities for T-control
-       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
-       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
-    bTrotter = (EI_VV(ir->eI)
-                && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
-
-    const bool bRerunMD = false;
-
-    int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
-    bGStatEveryStep   = (nstglobalcomm == 1);
-
-    const SimulationGroups* groups = &top_global->groups;
-
-    std::unique_ptr<EssentialDynamics> ed = nullptr;
-    if (opt2bSet("-ei", nfile, fnm))
-    {
-        /* Initialize essential dynamics sampling */
-        ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
-                        ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
-    }
-    else if (observablesHistory->edsamHistory)
-    {
-        gmx_fatal(FARGS,
-                  "The checkpoint is from a run with essential dynamics sampling, "
-                  "but the current run did not specify the -ei option. "
-                  "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
-    }
-
-    int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
-    gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
-    initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
-    Update     upd(*ir, deform);
-    const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
-    const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
-
-    const t_fcdata& fcdata = *fr->fcdata;
-
-    bool simulationsShareState = false;
-    int  nstSignalComm         = nstglobalcomm;
-    {
-        // TODO This implementation of ensemble orientation restraints is nasty because
-        // a user can't just do multi-sim with single-sim orientation restraints.
-        bool usingEnsembleRestraints =
-                (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
-        bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
-
-        // Replica exchange, ensemble restraints and AWH need all
-        // simulations to remain synchronized, so they need
-        // checkpoints and stop conditions to act on the same step, so
-        // the propagation of such signals must take place between
-        // simulations, not just within simulations.
-        // TODO: Make algorithm initializers set these flags.
-        simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim || (plumedswitch && ms); // PLUMED hack, if we have multiple sim and plumed we usually want them to be in sync 
-
-        if (simulationsShareState)
-        {
-            // Inter-simulation signal communication does not need to happen
-            // often, so we use a minimum of 200 steps to reduce overhead.
-            const int c_minimumInterSimulationSignallingInterval = 200;
-            nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
-                            * nstglobalcomm;
-        }
-    }
-
-    if (startingBehavior != StartingBehavior::RestartWithAppending)
-    {
-        pleaseCiteCouplingAlgorithms(fplog, *ir);
-    }
-    gmx_mdoutf* outf =
-            init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
-                        top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
-                                   mdoutf_get_fp_dhdl(outf), false, startingBehavior,
-                                   simulationsShareState, mdModulesNotifier);
-
-    gstat = global_stat_init(ir);
-
-    const auto& simulationWork     = runScheduleWork->simulationWork;
-    const bool  useGpuForPme       = simulationWork.useGpuPme;
-    const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
-    const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
-    const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
-
-    /* Check for polarizable models and flexible constraints */
-    shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
-                                 ir->nstcalcenergy, DOMAINDECOMP(cr), useGpuForPme);
-
-    {
-        double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
-        if ((io > 2000) && MASTER(cr))
-        {
-            fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
-        }
-    }
-
-    // Local state only becomes valid now.
-    std::unique_ptr<t_state> stateInstance;
-    t_state*                 state;
-
-    gmx_localtop_t top(top_global->ffparams);
-
-    auto mdatoms = mdAtoms->mdatoms();
-
-    ForceBuffers f(fr->useMts, ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
-                                       ? PinningPolicy::PinnedIfSupported
-                                       : PinningPolicy::CannotBePinned);
-    if (DOMAINDECOMP(cr))
-    {
-        stateInstance = std::make_unique<t_state>();
-        state         = stateInstance.get();
-        dd_init_local_state(cr->dd, state_global, state);
-
-        /* Distribute the charge groups over the nodes from the master node */
-        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
-                            imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
-                            nrnb, nullptr, FALSE);
-        shouldCheckNumberOfBondedInteractions = true;
-        upd.setNumAtoms(state->natoms);
-    }
-    else
-    {
-        state_change_natoms(state_global, state_global->natoms);
-        /* Copy the pointer to the global state */
-        state = state_global;
-
-        /* Generate and initialize new topology */
-        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
-
-        upd.setNumAtoms(state->natoms);
-    }
-
-    std::unique_ptr<UpdateConstrainGpu> integrator;
-
-    StatePropagatorDataGpu* stateGpu = fr->stateGpu;
-
-    // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
-    if (useGpuForUpdate)
-    {
-        GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
-                                   || constr->numConstraintsTotal() == 0,
-                           "Constraints in domain decomposition are only supported with update "
-                           "groups if using GPU update.\n");
-        GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
-                                   || constr->numConstraintsTotal() == 0,
-                           "SHAKE is not supported with GPU update.");
-        GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
-                           "Either PME or short-ranged non-bonded interaction tasks must run on "
-                           "the GPU to use GPU update.\n");
-        GMX_RELEASE_ASSERT(ir->eI == eiMD,
-                           "Only the md integrator is supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(
-                ir->etc != etcNOSEHOOVER,
-                "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(
-                ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN
-                        || ir->epc == epcCRESCALE,
-                "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
-                "with the GPU update.\n");
-        GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
-                           "Virtual sites are not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(ed == nullptr,
-                           "Essential dynamics is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
-                           "Constraints pulling is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
-                           "Orientation restraints are not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(
-                ir->efep == efepNO
-                        || (!haveFepPerturbedMasses(*top_global) && !havePerturbedConstraints(*top_global)),
-                "Free energy perturbation of masses and constraints are not supported with the GPU "
-                "update.");
-
-        if (constr != nullptr && constr->numConstraintsTotal() > 0)
-        {
-            GMX_LOG(mdlog.info)
-                    .asParagraph()
-                    .appendText("Updating coordinates and applying constraints on the GPU.");
-        }
-        else
-        {
-            GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
-        }
-        GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
-                           "Device stream manager should be initialized in order to use GPU "
-                           "update-constraints.");
-        GMX_RELEASE_ASSERT(
-                fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
-                "Update stream should be initialized in order to use GPU "
-                "update-constraints.");
-        integrator = std::make_unique<UpdateConstrainGpu>(
-                *ir, *top_global, fr->deviceStreamManager->context(),
-                fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
-                stateGpu->xUpdatedOnDevice(), wcycle);
-
-        integrator->setPbc(PbcType::Xyz, state->box);
-    }
-
-    if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
-    {
-        changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
-    }
-    if (useGpuForUpdate)
-    {
-        changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
-    }
-
-    // NOTE: The global state is no longer used at this point.
-    // But state_global is still used as temporary storage space for writing
-    // the global state to file and potentially for replica exchange.
-    // (Global topology should persist.)
-
-    update_mdatoms(mdatoms, state->lambda[efptMASS]);
-
-    if (ir->bExpanded)
-    {
-        /* Check nstexpanded here, because the grompp check was broken */
-        if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
-        {
-            gmx_fatal(FARGS,
-                      "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
-        }
-        init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist, mdlog);
-    }
-
-    if (MASTER(cr))
-    {
-        EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
-    }
-
-    preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
-                           startingBehavior != StartingBehavior::NewSimulation);
-
-    // TODO: Remove this by converting AWH into a ForceProvider
-    auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
-                                startingBehavior != StartingBehavior::NewSimulation,
-                                shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
-
-    if (useReplicaExchange && MASTER(cr))
-    {
-        repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
-    }
-    /* PME tuning is only supported in the Verlet scheme, with PME for
-     * Coulomb. It is not supported with only LJ PME. */
-    bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
-                && ir->cutoff_scheme != ecutsGROUP);
-
-    pme_load_balancing_t* pme_loadbal = nullptr;
-    if (bPMETune)
-    {
-        pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
-                         fr->nbv->useGpu());
-    }
-
-    if (!ir->bContinuation)
-    {
-        if (state->flags & (1U << estV))
-        {
-            auto v = makeArrayRef(state->v);
-            /* Set the velocities of vsites, shells and frozen atoms to zero */
-            for (i = 0; i < mdatoms->homenr; i++)
-            {
-                if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
-                {
-                    clear_rvec(v[i]);
-                }
-                else if (mdatoms->cFREEZE)
-                {
-                    for (m = 0; m < DIM; m++)
-                    {
-                        if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
-                        {
-                            v[i][m] = 0;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (constr)
-        {
-            /* Constrain the initial coordinates and velocities */
-            do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
-                               state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
-                               state->box, state->lambda[efptBONDED]);
-        }
-        if (vsite)
-        {
-            /* Construct the virtual sites for the initial configuration */
-            vsite->construct(state->x, ir->delta_t, {}, state->box);
-        }
-    }
-
-    if (ir->efep != efepNO)
-    {
-        /* Set free energy calculation frequency as the greatest common
-         * denominator of nstdhdl and repl_ex_nst. */
-        nstfep = ir->fepvals->nstdhdl;
-        if (ir->bExpanded)
-        {
-            nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
-        }
-        if (useReplicaExchange)
-        {
-            nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
-        }
-        if (ir->bDoAwh)
-        {
-            nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
-        }
-    }
-
-    /* Be REALLY careful about what flags you set here. You CANNOT assume
-     * this is the first step, since we might be restarting from a checkpoint,
-     * and in that case we should not do any modifications to the state.
-     */
-    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
-
-    // When restarting from a checkpoint, it can be appropriate to
-    // initialize ekind from quantities in the checkpoint. Otherwise,
-    // compute_globals must initialize ekind before the simulation
-    // starts/restarts. However, only the master rank knows what was
-    // found in the checkpoint file, so we have to communicate in
-    // order to coordinate the restart.
-    //
-    // TODO Consider removing this communication if/when checkpoint
-    // reading directly follows .tpr reading, because all ranks can
-    // agree on hasReadEkinState at that time.
-    bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
-    if (PAR(cr))
-    {
-        gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
-    }
-    if (hasReadEkinState)
-    {
-        restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
-    }
-
-    unsigned int cglo_flags =
-            (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
-             | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
-
-    bSumEkinhOld = FALSE;
-
-    t_vcm vcm(top_global->groups, *ir);
-    reportComRemovalInfo(fplog, vcm);
-
-    /* To minimize communication, compute_globals computes the COM velocity
-     * and the kinetic energy for the velocities without COM motion removed.
-     * Thus to get the kinetic energy without the COM contribution, we need
-     * to call compute_globals twice.
-     */
-    for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
-    {
-        unsigned int cglo_flags_iteration = cglo_flags;
-        if (bStopCM && cgloIteration == 0)
-        {
-            cglo_flags_iteration |= CGLO_STOPCM;
-            cglo_flags_iteration &= ~CGLO_TEMPERATURE;
-        }
-        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
-                        enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
-                        state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                        cglo_flags_iteration
-                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                         : 0));
-        if (cglo_flags_iteration & CGLO_STOPCM)
-        {
-            /* At initialization, do not pass x with acceleration-correction mode
-             * to avoid (incorrect) correction of the initial coordinates.
-             */
-            auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
-                                                                     : makeArrayRef(state->x);
-            process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
-            inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
-        }
-    }
-    checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
-                                    makeConstArrayRef(state->x), state->box,
-                                    &shouldCheckNumberOfBondedInteractions);
-    if (ir->eI == eiVVAK)
-    {
-        /* a second call to get the half step temperature initialized as well */
-        /* we do the same call as above, but turn the pressure off -- internally to
-           compute_globals, this is recognized as a velocity verlet half-step
-           kinetic energy calculation.  This minimized excess variables, but
-           perhaps loses some logic?*/
-
-        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
-                        enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
-                        state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
-    }
-
-    /* Calculate the initial half step temperature, and save the ekinh_old */
-    if (startingBehavior == StartingBehavior::NewSimulation)
-    {
-        for (i = 0; (i < ir->opts.ngtc); i++)
-        {
-            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
-        }
-    }
-
-    /* need to make an initiation call to get the Trotter variables set, as well as other constants
-       for non-trotter temperature control */
-    auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
-
-    if (MASTER(cr))
-    {
-        if (!ir->bContinuation)
-        {
-            if (constr && ir->eConstrAlg == econtLINCS)
-            {
-                fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
-                        constr->rmsd());
-            }
-            if (EI_STATE_VELOCITY(ir->eI))
-            {
-                real temp = enerd->term[F_TEMP];
-                if (ir->eI != eiVV)
-                {
-                    /* Result of Ekin averaged over velocities of -half
-                     * and +half step, while we only have -half step here.
-                     */
-                    temp *= 2;
-                }
-                fprintf(fplog, "Initial temperature: %g K\n", temp);
-            }
-        }
-
-        char tbuf[20];
-        fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
-        if (ir->nsteps >= 0)
-        {
-            sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
-        }
-        else
-        {
-            sprintf(tbuf, "%s", "infinite");
-        }
-        if (ir->init_step > 0)
-        {
-            fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
-                    gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
-                    gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
-        }
-        else
-        {
-            fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
-        }
-        fprintf(fplog, "\n");
-    }
-
-    /* PLUMED */
-    if(plumedswitch){
-      /* detect plumed API version */
-      int pversion=0;
-      plumed_cmd(plumedmain,"getApiVersion",&pversion);
-      /* setting kbT is only implemented with api>1) */
-      real kbT=ir->opts.ref_t[0]*BOLTZ;
-      if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
-      if(pversion>2){
-        int res=1;
-        if( (startingBehavior != StartingBehavior::NewSimulation) ) plumed_cmd(plumedmain,"setRestart",&res);
-      }
-
-      if(ms && ms->numSimulations_>1) {
-        if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&ms->mastersComm_);
-        if(PAR(cr)){
-          if(DOMAINDECOMP(cr)) {
-            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
-          }else{
-            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
-          }
-        }
-        plumed_cmd(plumedmain,"GREX init",nullptr);
-      }
-      if(PAR(cr)){
-        if(DOMAINDECOMP(cr)) {
-          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
-        }
-      }
-      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
-      plumed_cmd(plumedmain,"setMDEngine","gromacs");
-      plumed_cmd(plumedmain,"setLog",fplog);
-      real real_delta_t=ir->delta_t;
-      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
-      plumed_cmd(plumedmain,"init",nullptr);
-
-      if(PAR(cr)){
-        if(DOMAINDECOMP(cr)) {
-          int nat_home = dd_numHomeAtoms(*cr->dd);
-          plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
-          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
-        }
-      }
-      realFepState = state->fep_state;
-      plumed_cmd(plumedmain, "setExtraCV lambda", &realFepState);
-      plumed_cmd(plumedmain, "setExtraCVForce lambda", &lambdaForce);
-    }
-    /* END PLUMED */
-
-    walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
-    print_start(fplog, cr, walltime_accounting, "mdrun");
-
-    /***********************************************************
-     *
-     *             Loop over MD steps
-     *
-     ************************************************************/
-
-    bFirstStep = TRUE;
-    /* Skip the first Nose-Hoover integration when we get the state from tpx */
-    bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
-    bSumEkinhOld     = FALSE;
-    bExchanged       = FALSE;
-    bNeedRepartition = FALSE;
-    bDoReplEx        = FALSE;
-
-    step     = ir->init_step;
-    step_rel = 0;
-
-    auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
-            compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
-            MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
-            mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
-
-    auto checkpointHandler = std::make_unique<CheckpointHandler>(
-            compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
-            ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
-            mdrunOptions.checkpointOptions.period);
-
-    const bool resetCountersIsLocal = true;
-    auto       resetHandler         = std::make_unique<ResetHandler>(
-            compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
-            !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
-            mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
-
-    const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
-
-    if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
-    {
-        logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
-    }
-
-    /* and stop now if we should */
-    bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
-    while (!bLastStep)
-    {
-
-        /* Determine if this is a neighbor search step */
-        bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
-
-        if (bPMETune && bNStList)
-        {
-            // This has to be here because PME load balancing is called so early.
-            // TODO: Move to after all booleans are defined.
-            if (useGpuForUpdate && !bFirstStep)
-            {
-                stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-            }
-            /* PME grid + cut-off optimization with GPUs or PME nodes */
-            pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
-                           fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
-                           &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
-        }
-
-        wallcycle_start(wcycle, ewcSTEP);
-
-        bLastStep = (step_rel == ir->nsteps);
-        t         = t0 + step * ir->delta_t;
-
-        // TODO Refactor this, so that nstfep does not need a default value of zero
-        if (ir->efep != efepNO || ir->bSimTemp)
-        {
-            /* find and set the current lambdas */
-            state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
-
-            bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
-            bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
-            bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
-                           && (!bFirstStep));
-        }
-
-        bDoReplExPrev = bDoReplEx;
-        bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
-                     && do_per_step(step, replExParams.exchangeInterval));
-
-        if (doSimulatedAnnealing)
-        {
-            update_annealing_target_temp(ir, t, &upd);
-        }
-
-        /* Stop Center of Mass motion */
-        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
-
-        /* Determine whether or not to do Neighbour Searching */
-        bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
-
-        /* Note that the stopHandler will cause termination at nstglobalcomm
-         * steps. Since this concides with nstcalcenergy, nsttcouple and/or
-         * nstpcouple steps, we have computed the half-step kinetic energy
-         * of the previous step and can always output energies at the last step.
-         */
-        bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
-
-        /* do_log triggers energy and virial calculation. Because this leads
-         * to different code paths, forces can be different. Thus for exact
-         * continuation we should avoid extra log output.
-         * Note that the || bLastStep can result in non-exact continuation
-         * beyond the last step. But we don't consider that to be an issue.
-         */
-        do_log     = (do_per_step(step, ir->nstlog)
-                  || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
-        do_verbose = mdrunOptions.verbose
-                     && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
-
-        if (useGpuForUpdate && !bFirstStep && bNS)
-        {
-            // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
-            stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-            // Copy coordinate from the GPU when needed at the search step.
-            // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
-            // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
-            stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-
-        if (bNS && !(bFirstStep && ir->bContinuation))
-        {
-            bMasterState = FALSE;
-            /* Correct the new box if it is too skewed */
-            if (inputrecDynamicBox(ir))
-            {
-                if (correct_box(fplog, step, state->box))
-                {
-                    bMasterState = TRUE;
-                    // If update is offloaded, it should be informed about the box size change
-                    if (useGpuForUpdate)
-                    {
-                        integrator->setPbc(PbcType::Xyz, state->box);
-                    }
-                }
-            }
-            if (DOMAINDECOMP(cr) && bMasterState)
-            {
-                dd_collect_state(cr->dd, state, state_global);
-            }
-
-            if (DOMAINDECOMP(cr))
-            {
-                /* Repartition the domain decomposition */
-                dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
-                                    *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
-                                    fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
-                shouldCheckNumberOfBondedInteractions = true;
-                upd.setNumAtoms(state->natoms);
-
-                /* PLUMED */
-                if(plumedswitch){
-                  int nat_home = dd_numHomeAtoms(*cr->dd);
-                  plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
-                  plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
-                }
-                /* END PLUMED */
-            }
-        }
-
-        // Allocate or re-size GPU halo exchange object, if necessary
-        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
-        {
-            GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
-                               "GPU device manager has to be initialized to use GPU "
-                               "version of halo exchange.");
-            constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
-        }
-
-        if (MASTER(cr) && do_log)
-        {
-            gmx::EnergyOutput::printHeader(fplog, step,
-                                           t); /* can we improve the information printed here? */
-        }
-
-        if (ir->efep != efepNO)
-        {
-            update_mdatoms(mdatoms, state->lambda[efptMASS]);
-        }
-
-        if (bExchanged)
-        {
-
-            /* We need the kinetic energy at minus the half step for determining
-             * the full step kinetic energy and possibly for T-coupling.*/
-            /* This may not be quite working correctly yet . . . . */
-            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                            makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
-                            enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
-                            state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                            CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
-            checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
-                                            &top, makeConstArrayRef(state->x), state->box,
-                                            &shouldCheckNumberOfBondedInteractions);
-        }
-        clear_mat(force_vir);
-
-        /* PLUMED HREX */
-        gmx_bool bHREX = bDoReplEx && plumed_hrex;
-
-        if (plumedswitch && bHREX) {
-          // gmx_enerdata_t *hrex_enerd;
-          int nlambda = enerd->foreignLambdaTerms.numLambdas();
-          gmx_enerdata_t hrex_enerd(enerd->grpp.nener, nlambda == 0 ? 0 : nlambda - 1);
-          int repl  = -1;
-          int nrepl = -1;
-          if (MASTER(cr)){
-            repl  = replica_exchange_get_repl(repl_ex);
-            nrepl = replica_exchange_get_nrepl(repl_ex);
-          }
-
-          if (DOMAINDECOMP(cr)) {
-            dd_collect_state(cr->dd,state,state_global);
-          } else {
-            copy_state_serial(state, state_global);
-          }
-
-          if(MASTER(cr)){
-            if(repl%2==step/replExParams.exchangeInterval%2){
-              if(repl-1>=0) exchange_state(ms,repl-1,state_global);
-            }else{
-              if(repl+1<nrepl) exchange_state(ms,repl+1,state_global);
-            }
-          }
-          if (!DOMAINDECOMP(cr)) {
-            copy_state_serial(state_global, state);
-          }
-          if(PAR(cr)){
-            if (DOMAINDECOMP(cr)) {
-              dd_partition_system(fplog,mdlog,step,cr,TRUE,1,
-                                  state_global,*top_global,ir,
-                                  imdSession, pull_work,
-                                  state,&f,mdAtoms,&top,fr,vsite,constr,
-                                  nrnb,wcycle,FALSE);
-            }
-          }
-          do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
-                   nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
-                   &f.view(), force_vir, mdatoms, &hrex_enerd, state->lambda, 
-                   fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
-                   GMX_FORCE_STATECHANGED |
-                   GMX_FORCE_DYNAMICBOX |
-                   GMX_FORCE_ALLFORCES |
-                   GMX_FORCE_VIRIAL |
-                   GMX_FORCE_ENERGY |
-                   GMX_FORCE_DHDL |
-                   GMX_FORCE_NS,
-                   ddBalanceRegionHandler);
-
-          plumed_cmd(plumedmain,"GREX cacheLocalUSwap",&(&hrex_enerd)->term[F_EPOT]);
-
-          /* exchange back */
-          if (DOMAINDECOMP(cr)) {
-            dd_collect_state(cr->dd,state,state_global);
-          } else {
-            copy_state_serial(state, state_global);
-          }
-
-          if(MASTER(cr)){
-            if(repl%2==step/replExParams.exchangeInterval%2){
-              if(repl-1>=0) exchange_state(ms,repl-1,state_global);
-            }else{
-              if(repl+1<nrepl) exchange_state(ms,repl+1,state_global);
-            }
-          }
-
-          if (!DOMAINDECOMP(cr)) {
-            copy_state_serial(state_global, state);
-          }
-          if(PAR(cr)){
-            if (DOMAINDECOMP(cr)) {
-              dd_partition_system(fplog,mdlog,step,cr,TRUE,1,
-                                  state_global,*top_global,ir,
-                                  imdSession, pull_work,
-                                  state,&f,mdAtoms,&top,fr,vsite,constr,
-                                  nrnb,wcycle,FALSE);
-              int nat_home = dd_numHomeAtoms(*cr->dd);
-              plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
-              plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
-            }
-          }
-          bNS=true;
-        }
-        /* END PLUMED HREX */
-
-        checkpointHandler->decideIfCheckpointingThisStep(bNS||bDoReplExPrev, bFirstStep, bLastStep);
-
-        /* Determine the energy and pressure:
-         * at nstcalcenergy steps and at energy output steps (set below).
-         */
-        if (EI_VV(ir->eI) && (!bInitStep))
-        {
-            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
-            bCalcVir      = bCalcEnerStep
-                       || (ir->epc != epcNO
-                           && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
-        }
-        else
-        {
-            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
-            bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
-        }
-        bCalcEner = bCalcEnerStep;
-
-        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
-
-        if (do_ene || do_log || bDoReplEx)
-        {
-            bCalcVir  = TRUE;
-            bCalcEner = TRUE;
-        }
-
-        /* Do we need global communication ? */
-        bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
-                  || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
-
-        force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
-                       | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
-                       | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
-        if (fr->useMts && !do_per_step(step, ir->nstfout))
-        {
-            force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
-        }
-
-        if (shellfc)
-        {
-            /* Now is the time to relax the shells */
-            relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
-                                imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
-                                state->natoms, state->x.arrayRefWithPadding(),
-                                state->v.arrayRefWithPadding(), state->box, state->lambda,
-                                &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
-                                fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
-        }
-        else
-        {
-            /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias
-               is updated (or the AWH update will be performed twice for one step when continuing).
-               It would be best to call this update function from do_md_trajectory_writing but that
-               would occur after do_force. One would have to divide the update_awh function into one
-               function applying the AWH force and one doing the AWH bias update. The update AWH
-               bias function could then be called after do_md_trajectory_writing (then containing
-               update_awh_history). The checkpointing will in the future probably be moved to the start
-               of the md loop, which will get rid of this issue. */
-            if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
-            {
-                awh->updateHistory(state_global->awhHistory.get());
-            }
-
-            /* The coordinates (x) are shifted (to get whole molecules)
-             * in do_force.
-             * This is parallelized as well, and does communication too.
-             * Check comments in sim_util.c
-             */
-
-             /* PLUMED */
-            plumedNeedsEnergy=0;
-            if(plumedswitch){
-              int pversion=0;
-              plumed_cmd(plumedmain,"getApiVersion",&pversion);
-              long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
-              plumed_cmd(plumedmain,"setPositions",&state->x[0][0]);
-              plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]);
-              plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]);
-              plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
-              plumed_cmd(plumedmain,"prepareCalc",nullptr);
-              plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop);
-              int checkp=0; if(checkpointHandler->isCheckpointingStep()) checkp=1;
-              if(pversion>3) plumed_cmd(plumedmain,"doCheckPoint",&checkp);
-              plumed_cmd(plumedmain,"setForces",&f.view().force()[0][0]);
-              plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
-              if(plumedNeedsEnergy) force_flags |= GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL;
-              clear_mat(plumed_vir);
-              plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]);
-            }
-            /* END PLUMED */
-            do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
-                     nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
-                     &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
-                     vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
-                     (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
-            /* PLUMED */
-            if(plumedswitch){
-              if(plumedNeedsEnergy){
-                msmul(force_vir,2.0,plumed_vir);
-                plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
-                plumed_cmd(plumedmain,"performCalc",nullptr);
-                msmul(plumed_vir,0.5,force_vir);
-              } else {
-                msmul(plumed_vir,0.5,plumed_vir);
-                m_add(force_vir,plumed_vir,force_vir);
-              }
-              if(bDoReplEx) plumed_cmd(plumedmain,"GREX savePositions",nullptr);
-              if(plumedWantsToStop) ir->nsteps=step_rel+1;
-              if(bHREX) plumed_cmd(plumedmain,"GREX cacheLocalUNow",&enerd->term[F_EPOT]);
-            }
-            /* END PLUMED */
-        }
-
-        // VV integrators do not need the following velocity half step
-        // if it is the first step after starting from a checkpoint.
-        // That is, the half step is needed on all other steps, and
-        // also the first step when starting from a .tpr file.
-        if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
-        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
-        {
-            rvec* vbuf = nullptr;
-
-            wallcycle_start(wcycle, ewcUPDATE);
-            if (ir->eI == eiVV && bInitStep)
-            {
-                /* if using velocity verlet with full time step Ekin,
-                 * take the first half step only to compute the
-                 * virial for the first step. From there,
-                 * revert back to the initial coordinates
-                 * so that the input is actually the initial step.
-                 */
-                snew(vbuf, state->natoms);
-                copy_rvecn(state->v.rvec_array(), vbuf, 0,
-                           state->natoms); /* should make this better for parallelizing? */
-            }
-            else
-            {
-                /* this is for NHC in the Ekin(t+dt/2) version of vv */
-                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
-                               trotter_seq, ettTSEQ1);
-            }
-
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtVELOCITY1, cr, constr != nullptr);
-
-            wallcycle_stop(wcycle, ewcUPDATE);
-            constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
-            wallcycle_start(wcycle, ewcUPDATE);
-            /* if VV, compute the pressure and constraints */
-            /* For VV2, we strictly only need this if using pressure
-             * control, but we really would like to have accurate pressures
-             * printed out.
-             * Think about ways around this in the future?
-             * For now, keep this choice in comments.
-             */
-            /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
-            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
-            bPres = TRUE;
-            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
-            if (bCalcEner && ir->eI == eiVVAK)
-            {
-                bSumEkinhOld = TRUE;
-            }
-            /* for vv, the first half of the integration actually corresponds to the previous step.
-               So we need information from the last step in the first half of the integration */
-            if (bGStat || do_per_step(step - 1, nstglobalcomm))
-            {
-                wallcycle_stop(wcycle, ewcUPDATE);
-                compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                                makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
-                                enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
-                                state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                                (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
-                                        | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
-                                        | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
-                                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                                 : 0)
-                                        | CGLO_SCALEEKIN);
-                /* explanation of above:
-                   a) We compute Ekin at the full time step
-                   if 1) we are using the AveVel Ekin, and it's not the
-                   initial step, or 2) if we are using AveEkin, but need the full
-                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
-                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
-                   EkinAveVel because it's needed for the pressure */
-                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
-                                                top_global, &top, makeConstArrayRef(state->x),
-                                                state->box, &shouldCheckNumberOfBondedInteractions);
-                if (bStopCM)
-                {
-                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
-                                           makeArrayRef(state->v));
-                    inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
-                }
-                wallcycle_start(wcycle, ewcUPDATE);
-            }
-            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
-            if (!bInitStep)
-            {
-                if (bTrotter)
-                {
-                    m_add(force_vir, shake_vir,
-                          total_vir); /* we need the un-dispersion corrected total vir here */
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
-                                   trotter_seq, ettTSEQ2);
-
-                    /* TODO This is only needed when we're about to write
-                     * a checkpoint, because we use it after the restart
-                     * (in a kludge?). But what should we be doing if
-                     * the startingBehavior is NewSimulation or bInitStep are true? */
-                    if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
-                    {
-                        copy_mat(shake_vir, state->svir_prev);
-                        copy_mat(force_vir, state->fvir_prev);
-                    }
-                    if ((inputrecNptTrotter(ir) || inputrecNvtTrotter(ir)) && ir->eI == eiVV)
-                    {
-                        /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
-                        enerd->term[F_TEMP] =
-                                sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
-                        enerd->term[F_EKIN] = trace(ekind->ekin);
-                    }
-                }
-                else if (bExchanged)
-                {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    /* We need the kinetic energy at minus the half step for determining
-                     * the full step kinetic energy and possibly for T-coupling.*/
-                    /* This may not be quite working correctly yet . . . . */
-                    compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                                    makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
-                                    enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
-                                    state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
-                    wallcycle_start(wcycle, ewcUPDATE);
-                }
-            }
-            /* if it's the initial step, we performed this first step just to get the constraint virial */
-            if (ir->eI == eiVV && bInitStep)
-            {
-                copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
-                sfree(vbuf);
-            }
-            wallcycle_stop(wcycle, ewcUPDATE);
-        }
-
-        /* compute the conserved quantity */
-        if (EI_VV(ir->eI))
-        {
-            saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
-            if (ir->eI == eiVV)
-            {
-                last_ekin = enerd->term[F_EKIN];
-            }
-            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
-            {
-                saved_conserved_quantity -= enerd->term[F_DISPCORR];
-            }
-            /* sum up the foreign kinetic energy and dK/dl terms for vv.  currently done every step so that dhdl is correct in the .edr */
-            if (ir->efep != efepNO)
-            {
-                accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
-            }
-        }
-
-        /* ########  END FIRST UPDATE STEP  ############## */
-        /* ########  If doing VV, we now have v(dt) ###### */
-        if (bDoExpanded)
-        {
-            /* perform extended ensemble sampling in lambda - we don't
-               actually move to the new state before outputting
-               statistics, but if performing simulated tempering, we
-               do update the velocities and the tau_t. */
-
-            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
-                                              state->dfhist, step, state->v.rvec_array(), mdatoms, &realFepState);
-            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
-            if (MASTER(cr))
-            {
-                copy_df_history(state_global->dfhist, state->dfhist);
-            }
-        }
-
-        // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
-        // coordinates have not already been copied for i) search or ii) CPU force tasks.
-        if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
-            && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
-                || checkpointHandler->isCheckpointingStep()))
-        {
-            stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-        // Copy velocities if needed for the output/checkpointing.
-        // NOTE: Copy on the search steps is done at the beginning of the step.
-        if (useGpuForUpdate && !bNS
-            && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
-        {
-            stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-        }
-        // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
-        // and update is offloaded hence forces are kept on the GPU for update and have not been
-        // already transferred in do_force().
-        // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
-        //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
-        //       prior to GPU update.
-        // TODO: When the output flags will be included in step workload, this copy can be combined with the
-        //       copy call in do_force(...).
-        // NOTE: The forces should not be copied here if the vsites are present, since they were modified
-        //       on host after the D2H copy in do_force(...).
-        if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
-            && do_per_step(step, ir->nstfout))
-        {
-            stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
-            stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
-        }
-        /* Now we have the energies and forces corresponding to the
-         * coordinates at time t. We must output all of this before
-         * the update.
-         */
-        do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
-                                 observablesHistory, top_global, fr, outf, energyOutput, ekind,
-                                 f.view().force(), checkpointHandler->isCheckpointingStep(),
-                                 bRerunMD, bLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
-        /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
-        bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
-
-        /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
-        if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
-            && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
-        {
-            copy_mat(state->svir_prev, shake_vir);
-            copy_mat(state->fvir_prev, force_vir);
-        }
-
-        stopHandler->setSignal();
-        resetHandler->setSignal(walltime_accounting);
-
-        if (bGStat || !PAR(cr))
-        {
-            /* In parallel we only have to check for checkpointing in steps
-             * where we do global communication,
-             *  otherwise the other nodes don't know.
-             */
-            checkpointHandler->setSignal(walltime_accounting);
-        }
-
-        /* #########   START SECOND UPDATE STEP ################# */
-
-        /* at the start of step, randomize or scale the velocities (if vv). Restriction of Andersen
-           is controlled in preprocessing */
-
-        if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
-        {
-            gmx_bool bIfRandomize;
-            bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
-            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
-            if (constr && bIfRandomize)
-            {
-                constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
-            }
-        }
-        /* Box is changed in update() when we do pressure coupling,
-         * but we should still use the old box for energy corrections and when
-         * writing it to the energy file, so it matches the trajectory files for
-         * the same timestep above. Make a copy in a separate array.
-         */
-        copy_mat(state->box, lastbox);
-
-        dvdl_constr = 0;
-
-        if (!useGpuForUpdate)
-        {
-            wallcycle_start(wcycle, ewcUPDATE);
-        }
-        /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
-        if (bTrotter)
-        {
-            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
-            /* We can only do Berendsen coupling after we have summed
-             * the kinetic energy or virial. Since this happens
-             * in global_state after update, we should only do it at
-             * step % nstlist = 1 with bGStatEveryStep=FALSE.
-             */
-        }
-        else
-        {
-            update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
-            update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
-        }
-
-        if (EI_VV(ir->eI))
-        {
-            /* velocity half-step update */
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtVELOCITY2, cr, constr != nullptr);
-        }
-
-        /* Above, initialize just copies ekinh into ekin,
-         * it doesn't copy position (for VV),
-         * and entire integrator for MD.
-         */
-
-        if (ir->eI == eiVVAK)
-        {
-            cbuf.resize(state->x.size());
-            std::copy(state->x.begin(), state->x.end(), cbuf.begin());
-        }
-
-        /* With leap-frog type integrators we compute the kinetic energy
-         * at a whole time step as the average of the half-time step kinetic
-         * energies of two subsequent steps. Therefore we need to compute the
-         * half step kinetic energy also if we need energies at the next step.
-         */
-        const bool needHalfStepKineticEnergy =
-                (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
-
-        // Parrinello-Rahman requires the pressure to be available before the update to compute
-        // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
-        const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
-                                         && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
-
-        if (useGpuForUpdate)
-        {
-            if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
-            {
-                integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
-                                stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
-
-                // Copy data to the GPU after buffers might have been reinitialized
-                stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
-            }
-
-            if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
-                && !thisRankHasDuty(cr, DUTY_PME))
-            {
-                // The PME forces were received on the host, so have to be copied
-                stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
-            }
-            else if (!runScheduleWork->stepWork.useGpuFBufferOps)
-            {
-                // The buffer ops were not offloaded this step, so the forces are on the
-                // host and have to be copied
-                stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
-            }
-
-            const bool doTemperatureScaling =
-                    (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
-
-            // This applies Leap-Frog, LINCS and SETTLE in succession
-            integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
-                                          AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                                  ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
-                                  ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
-
-            // Copy velocities D2H after update if:
-            // - Globals are computed this step (includes the energy output steps).
-            // - Temperature is needed for the next step.
-            if (bGStat || needHalfStepKineticEnergy)
-            {
-                stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-                stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-            }
-        }
-        else
-        {
-            /* With multiple time stepping we need to do an additional normal
-             * update step to obtain the virial, as the actual MTS integration
-             * uses an acceleration where the slow forces are multiplied by mtsFactor.
-             * Using that acceleration would result in a virial in which the slow
-             * force contribution would be a factor mtsFactor too large.
-             */
-            if (fr->useMts && bCalcVir && constr != nullptr)
-            {
-                upd.update_for_constraint_virial(*ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
-
-                constrain_coordinates(constr, do_log, do_ene, step, state,
-                                      upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
-            }
-
-            ArrayRefWithPadding<const RVec> forceCombined =
-                    (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
-                            ? f.view().forceMtsCombinedWithPadding()
-                            : f.view().forceWithPadding();
-            upd.update_coords(*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M,
-                              etrtPOSITION, cr, constr != nullptr);
-
-            wallcycle_stop(wcycle, ewcUPDATE);
-
-            constrain_coordinates(constr, do_log, do_ene, step, state, upd.xp()->arrayRefWithPadding(),
-                                  &dvdl_constr, bCalcVir && !fr->useMts, shake_vir);
-
-            upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
-                                      constr, do_log, do_ene);
-            upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
-        }
-
-        if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
-        {
-            updatePrevStepPullCom(pull_work, state);
-        }
-
-        if (ir->eI == eiVVAK)
-        {
-            /* erase F_EKIN and F_TEMP here? */
-            /* just compute the kinetic energy at the half step to perform a trotter step */
-            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                            makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
-                            force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
-                            nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
-            wallcycle_start(wcycle, ewcUPDATE);
-            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
-            /* now we know the scaling, we can compute the positions again */
-            std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
-
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtPOSITION, cr, constr != nullptr);
-            wallcycle_stop(wcycle, ewcUPDATE);
-
-            /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
-            /* are the small terms in the shake_vir here due
-             * to numerical errors, or are they important
-             * physically? I'm thinking they are just errors, but not completely sure.
-             * For now, will call without actually constraining, constr=nullptr*/
-            upd.finish_update(*ir, mdatoms, state, wcycle, false);
-        }
-        if (EI_VV(ir->eI))
-        {
-            /* this factor of 2 correction is necessary
-               because half of the constraint force is removed
-               in the vv step, so we have to double it.  See
-               the Issue #1255.  It is not yet clear
-               if the factor of 2 is exact, or just a very
-               good approximation, and this will be
-               investigated.  The next step is to see if this
-               can be done adding a dhdl contribution from the
-               rattle step, but this is somewhat more
-               complicated with the current code. Will be
-               investigated, hopefully for 4.6.3. However,
-               this current solution is much better than
-               having it completely wrong.
-             */
-            enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
-        }
-        else
-        {
-            enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-        }
-
-        if (vsite != nullptr)
-        {
-            wallcycle_start(wcycle, ewcVSITECONSTR);
-            vsite->construct(state->x, ir->delta_t, state->v, state->box);
-            wallcycle_stop(wcycle, ewcVSITECONSTR);
-        }
-
-        /* ############## IF NOT VV, Calculate globals HERE  ############ */
-        /* With Leap-Frog we can skip compute_globals at
-         * non-communication steps, but we need to calculate
-         * the kinetic energy one step before communication.
-         */
-        {
-            // Organize to do inter-simulation signalling on steps if
-            // and when algorithms require it.
-            const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
-
-            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
-            {
-                // Copy coordinates when needed to stop the CM motion.
-                if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
-                {
-                    stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-                }
-                // Since we're already communicating at this step, we
-                // can propagate intra-simulation signals. Note that
-                // check_nstglobalcomm has the responsibility for
-                // choosing the value of nstglobalcomm that is one way
-                // bGStat becomes true, so we can't get into a
-                // situation where e.g. checkpointing can't be
-                // signalled.
-                bool                doIntraSimSignal = true;
-                SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
-
-                compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                                makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
-                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
-                                &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                                (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
-                                        | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
-                                        | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
-                                        | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
-                                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                                 : 0));
-                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
-                                                top_global, &top, makeConstArrayRef(state->x),
-                                                state->box, &shouldCheckNumberOfBondedInteractions);
-                if (!EI_VV(ir->eI) && bStopCM)
-                {
-                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
-                                           makeArrayRef(state->v));
-                    inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
-
-                    // TODO: The special case of removing CM motion should be dealt with more gracefully
-                    if (useGpuForUpdate)
-                    {
-                        stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
-                        // Here we block until the H2D copy completes because event sync with the
-                        // force kernels that use the coordinates on the next steps is not implemented
-                        // (not because of a race on state->x being modified on the CPU while H2D is in progress).
-                        stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
-                        // If the COM removal changed the velocities on the CPU, this has to be accounted for.
-                        if (vcm.mode != ecmNO)
-                        {
-                            stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                        }
-                    }
-                }
-            }
-        }
-
-        /* #############  END CALC EKIN AND PRESSURE ################# */
-
-        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-           the virial that should probably be addressed eventually. state->veta has better properies,
-           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
-
-        if (ir->efep != efepNO && !EI_VV(ir->eI))
-        {
-            /* Sum up the foreign energy and dK/dl terms for md and sd.
-               Currently done every step so that dH/dl is correct in the .edr */
-            accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
-        }
-
-        update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
-                                         pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
-
-        const bool doBerendsenPressureCoupling =
-                (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
-        const bool doCRescalePressureCoupling =
-                (inputrec->epc == epcCRESCALE && do_per_step(step, inputrec->nstpcouple));
-        if (useGpuForUpdate
-            && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
-        {
-            integrator->scaleCoordinates(pressureCouplingMu);
-            if (doCRescalePressureCoupling)
-            {
-                matrix pressureCouplingInvMu;
-                gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
-                integrator->scaleVelocities(pressureCouplingInvMu);
-            }
-            integrator->setPbc(PbcType::Xyz, state->box);
-        }
-
-        /* ################# END UPDATE STEP 2 ################# */
-        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
-
-        /* The coordinates (x) were unshifted in update */
-        if (!bGStat)
-        {
-            /* We will not sum ekinh_old,
-             * so signal that we still have to do it.
-             */
-            bSumEkinhOld = TRUE;
-        }
-
-        if (bCalcEner)
-        {
-            /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
-
-            /* use the directly determined last velocity, not actually the averaged half steps */
-            if (bTrotter && ir->eI == eiVV)
-            {
-                enerd->term[F_EKIN] = last_ekin;
-            }
-            enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
-
-            if (integratorHasConservedEnergyQuantity(ir))
-            {
-                if (EI_VV(ir->eI))
-                {
-                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
-                }
-                else
-                {
-                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
-                }
-            }
-            /* #########  END PREPARING EDR OUTPUT  ###########  */
-        }
-
-        /* Output stuff */
-        if (MASTER(cr))
-        {
-            if (fplog && do_log && bDoExpanded)
-            {
-                /* only needed if doing expanded ensemble */
-                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
-                                          ir->bSimTemp ? ir->simtempvals : nullptr,
-                                          state_global->dfhist, state->fep_state, ir->nstlog, step);
-            }
-            if (bCalcEner)
-            {
-                energyOutput.addDataAtEnergyStep(
-                        bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
-                        ir->expandedvals, lastbox,
-                        PTCouplingArrays{ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
-                                          state->nhpres_xi, state->nhpres_vxi },
-                        state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
-            }
-            else
-            {
-                energyOutput.recordNonEnergyStep();
-            }
-
-            gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
-            gmx_bool do_or = do_per_step(step, ir->nstorireout);
-
-            if (doSimulatedAnnealing)
-            {
-                gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
-                                                              &(ir->opts));
-            }
-            if (do_log || do_ene || do_dr || do_or)
-            {
-                energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
-                                                   do_log ? fplog : nullptr, step, t,
-                                                   fr->fcdata.get(), awh.get());
-            }
-            if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
-            {
-                const bool isInitialOutput = false;
-                printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
-            }
-
-            if (ir->bPull)
-            {
-                pull_print_output(pull_work, step, t);
-            }
-
-            if (do_per_step(step, ir->nstlog))
-            {
-                if (fflush(fplog) != 0)
-                {
-                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
-                }
-            }
-        }
-        if (bDoExpanded)
-        {
-            /* Have to do this part _after_ outputting the logfile and the edr file */
-            /* Gets written into the state at the beginning of next loop*/
-            state->fep_state = lamnew;
-            if(plumedswitch)
-            {
-                realFepState = state->fep_state;
-            }
-        }
-        else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
-        {
-            state->fep_state = awh->fepLambdaState();
-        }
-        /* Print the remaining wall clock time for the run */
-        if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
-        {
-            if (shellfc)
-            {
-                fprintf(stderr, "\n");
-            }
-            print_time(stderr, walltime_accounting, step, ir, cr);
-        }
-
-        /* Ion/water position swapping.
-         * Not done in last step since trajectory writing happens before this call
-         * in the MD loop and exchanges would be lost anyway. */
-        bNeedRepartition = FALSE;
-        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
-        {
-            bNeedRepartition =
-                    do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
-                                  state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
-
-            if (bNeedRepartition && DOMAINDECOMP(cr))
-            {
-                dd_collect_state(cr->dd, state, state_global);
-            }
-        }
-
-        /* Replica exchange */
-        bExchanged = FALSE;
-        if (bDoReplEx)
-        {
-            bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
-        }
-
-        if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
-        {
-            dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
-                                imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
-                                nrnb, wcycle, FALSE);
-            shouldCheckNumberOfBondedInteractions = true;
-            upd.setNumAtoms(state->natoms);
-        }
-
-        bFirstStep = FALSE;
-        bInitStep  = FALSE;
-
-        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
-        /* With all integrators, except VV, we need to retain the pressure
-         * at the current step for coupling at the next step.
-         */
-        if ((state->flags & (1U << estPRES_PREV))
-            && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
-        {
-            /* Store the pressure in t_state for pressure coupling
-             * at the next MD step.
-             */
-            copy_mat(pres, state->pres_prev);
-        }
-
-        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
-
-        if ((membed != nullptr) && (!bLastStep))
-        {
-            rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
-        }
-
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
-        if (DOMAINDECOMP(cr) && wcycle)
-        {
-            dd_cycles_add(cr->dd, cycles, ddCyclStep);
-        }
-
-        /* increase the MD step number */
-        step++;
-        step_rel++;
-
-#if GMX_FAHCORE
-        if (MASTER(cr))
-        {
-            fcReportProgress(ir->nsteps + ir->init_step, step);
-        }
-#endif
-
-        resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
-                                    fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
-
-        /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
-        imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
-    }
-    /* End of main MD loop */
-
-    /* Closing TNG files can include compressing data. Therefore it is good to do that
-     * before stopping the time measurements. */
-    mdoutf_tng_close(outf);
-
-    /* Stop measuring walltime */
-    walltime_accounting_end_time(walltime_accounting);
-
-    if (!thisRankHasDuty(cr, DUTY_PME))
-    {
-        /* Tell the PME only node to finish */
-        gmx_pme_send_finish(cr);
-    }
-
-    if (MASTER(cr))
-    {
-        if (ir->nstcalcenergy > 0)
-        {
-            energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
-
-            gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
-            energyOutput.printAverages(fplog, groups);
-        }
-    }
-    done_mdoutf(outf);
-
-    if (bPMETune)
-    {
-        pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
-    }
-
-    done_shellfc(fplog, shellfc, step_rel);
-
-    if (useReplicaExchange && MASTER(cr))
-    {
-        print_replica_exchange_statistics(fplog, repl_ex);
-    }
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
-
-    global_stat_destroy(gstat);
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/md.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/md.cpp.preplumed
deleted file mode 100644
index bc367f6e57..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/md.cpp.preplumed
+++ /dev/null
@@ -1,1728 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements the integrator for normal molecular dynamics simulations
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-
-#include <algorithm>
-#include <memory>
-#include <numeric>
-
-#include "gromacs/applied_forces/awh/awh.h"
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/collect.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_network.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/gpuhaloexchange.h"
-#include "gromacs/domdec/mdsetup.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/ewald/pme_load_balancing.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/fileio/trxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/invertmatrix.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/math/vectypes.h"
-#include "gromacs/mdlib/checkpointhandler.h"
-#include "gromacs/mdlib/compute_io.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/coupling.h"
-#include "gromacs/mdlib/ebin.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/energyoutput.h"
-#include "gromacs/mdlib/expanded.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/freeenergyparameters.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/mdoutf.h"
-#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/resethandler.h"
-#include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/simulationsignal.h"
-#include "gromacs/mdlib/stat.h"
-#include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/trajectory_writing.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/update_constrain_gpu.h"
-#include "gromacs/mdlib/vcm.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdtypes/awh_history.h"
-#include "gromacs/mdtypes/awh_params.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/df_history.h"
-#include "gromacs/mdtypes/energyhistory.h"
-#include "gromacs/mdtypes/fcdata.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/multipletimestepping.h"
-#include "gromacs/mdtypes/observableshistory.h"
-#include "gromacs/mdtypes/pullhistory.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/modularsimulator/energydata.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/output.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/swap/swapcoords.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/atoms.h"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/trajectory/trajectoryframe.h"
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/real.h"
-#include "gromacs/utility/smalloc.h"
-
-#include "legacysimulator.h"
-#include "replicaexchange.h"
-#include "shellfc.h"
-
-using gmx::SimulationSignaller;
-
-void gmx::LegacySimulator::do_md()
-{
-    // TODO Historically, the EM and MD "integrators" used different
-    // names for the t_inputrec *parameter, but these must have the
-    // same name, now that it's a member of a struct. We use this ir
-    // alias to avoid a large ripple of nearly useless changes.
-    // t_inputrec is being replaced by IMdpOptionsProvider, so this
-    // will go away eventually.
-    t_inputrec*  ir = inputrec;
-    int64_t      step, step_rel;
-    double       t, t0 = ir->init_t;
-    gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
-    gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
-    gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
-    gmx_bool     do_ene, do_log, do_verbose;
-    gmx_bool     bMasterState;
-    unsigned int force_flags;
-    tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
-    int    i, m;
-    rvec   mu_tot;
-    matrix pressureCouplingMu, M;
-    gmx_repl_ex_t     repl_ex = nullptr;
-    gmx_global_stat_t gstat;
-    gmx_shellfc_t*    shellfc;
-    gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
-    gmx_bool          bTemp, bPres, bTrotter;
-    real              dvdl_constr;
-    std::vector<RVec> cbuf;
-    matrix            lastbox;
-    int               lamnew = 0;
-    /* for FEP */
-    int       nstfep = 0;
-    double    cycles;
-    real      saved_conserved_quantity = 0;
-    real      last_ekin                = 0;
-    t_extmass MassQ;
-    char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
-
-    /* PME load balancing data for GPU kernels */
-    gmx_bool bPMETune         = FALSE;
-    gmx_bool bPMETunePrinting = FALSE;
-
-    bool bInteractiveMDstep = false;
-
-    /* Domain decomposition could incorrectly miss a bonded
-       interaction, but checking for that requires a global
-       communication stage, which does not otherwise happen in DD
-       code. So we do that alongside the first global energy reduction
-       after a new DD is made. These variables handle whether the
-       check happens, and the result it returns. */
-    bool shouldCheckNumberOfBondedInteractions = false;
-    int  totalNumberOfBondedInteractions       = -1;
-
-    SimulationSignals signals;
-    // Most global communnication stages don't propagate mdrun
-    // signals, and will use this object to achieve that.
-    SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
-
-    if (!mdrunOptions.writeConfout)
-    {
-        // This is on by default, and the main known use case for
-        // turning it off is for convenience in benchmarking, which is
-        // something that should not show up in the general user
-        // interface.
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText(
-                        "The -noconfout functionality is deprecated, and may be removed in a "
-                        "future version.");
-    }
-
-    /* md-vv uses averaged full step velocities for T-control
-       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
-       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
-    bTrotter = (EI_VV(ir->eI)
-                && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
-
-    const bool bRerunMD = false;
-
-    int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
-    bGStatEveryStep   = (nstglobalcomm == 1);
-
-    const SimulationGroups* groups = &top_global->groups;
-
-    std::unique_ptr<EssentialDynamics> ed = nullptr;
-    if (opt2bSet("-ei", nfile, fnm))
-    {
-        /* Initialize essential dynamics sampling */
-        ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
-                        ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
-    }
-    else if (observablesHistory->edsamHistory)
-    {
-        gmx_fatal(FARGS,
-                  "The checkpoint is from a run with essential dynamics sampling, "
-                  "but the current run did not specify the -ei option. "
-                  "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
-    }
-
-    int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
-    gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
-    initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
-    Update     upd(*ir, deform);
-    const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
-    const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
-
-    const t_fcdata& fcdata = *fr->fcdata;
-
-    bool simulationsShareState = false;
-    int  nstSignalComm         = nstglobalcomm;
-    {
-        // TODO This implementation of ensemble orientation restraints is nasty because
-        // a user can't just do multi-sim with single-sim orientation restraints.
-        bool usingEnsembleRestraints =
-                (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
-        bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
-
-        // Replica exchange, ensemble restraints and AWH need all
-        // simulations to remain synchronized, so they need
-        // checkpoints and stop conditions to act on the same step, so
-        // the propagation of such signals must take place between
-        // simulations, not just within simulations.
-        // TODO: Make algorithm initializers set these flags.
-        simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
-
-        if (simulationsShareState)
-        {
-            // Inter-simulation signal communication does not need to happen
-            // often, so we use a minimum of 200 steps to reduce overhead.
-            const int c_minimumInterSimulationSignallingInterval = 200;
-            nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
-                            * nstglobalcomm;
-        }
-    }
-
-    if (startingBehavior != StartingBehavior::RestartWithAppending)
-    {
-        pleaseCiteCouplingAlgorithms(fplog, *ir);
-    }
-    gmx_mdoutf* outf =
-            init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
-                        top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
-                                   mdoutf_get_fp_dhdl(outf), false, startingBehavior,
-                                   simulationsShareState, mdModulesNotifier);
-
-    gstat = global_stat_init(ir);
-
-    const auto& simulationWork     = runScheduleWork->simulationWork;
-    const bool  useGpuForPme       = simulationWork.useGpuPme;
-    const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
-    const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
-    const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
-
-    /* Check for polarizable models and flexible constraints */
-    shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
-                                 ir->nstcalcenergy, DOMAINDECOMP(cr), useGpuForPme);
-
-    {
-        double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
-        if ((io > 2000) && MASTER(cr))
-        {
-            fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
-        }
-    }
-
-    // Local state only becomes valid now.
-    std::unique_ptr<t_state> stateInstance;
-    t_state*                 state;
-
-    gmx_localtop_t top(top_global->ffparams);
-
-    auto mdatoms = mdAtoms->mdatoms();
-
-    ForceBuffers f(fr->useMts, ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
-                                       ? PinningPolicy::PinnedIfSupported
-                                       : PinningPolicy::CannotBePinned);
-    if (DOMAINDECOMP(cr))
-    {
-        stateInstance = std::make_unique<t_state>();
-        state         = stateInstance.get();
-        dd_init_local_state(cr->dd, state_global, state);
-
-        /* Distribute the charge groups over the nodes from the master node */
-        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
-                            imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
-                            nrnb, nullptr, FALSE);
-        shouldCheckNumberOfBondedInteractions = true;
-        upd.setNumAtoms(state->natoms);
-    }
-    else
-    {
-        state_change_natoms(state_global, state_global->natoms);
-        /* Copy the pointer to the global state */
-        state = state_global;
-
-        /* Generate and initialize new topology */
-        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
-
-        upd.setNumAtoms(state->natoms);
-    }
-
-    std::unique_ptr<UpdateConstrainGpu> integrator;
-
-    StatePropagatorDataGpu* stateGpu = fr->stateGpu;
-
-    // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
-    if (useGpuForUpdate)
-    {
-        GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
-                                   || constr->numConstraintsTotal() == 0,
-                           "Constraints in domain decomposition are only supported with update "
-                           "groups if using GPU update.\n");
-        GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
-                                   || constr->numConstraintsTotal() == 0,
-                           "SHAKE is not supported with GPU update.");
-        GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
-                           "Either PME or short-ranged non-bonded interaction tasks must run on "
-                           "the GPU to use GPU update.\n");
-        GMX_RELEASE_ASSERT(ir->eI == eiMD,
-                           "Only the md integrator is supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(
-                ir->etc != etcNOSEHOOVER,
-                "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(
-                ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN
-                        || ir->epc == epcCRESCALE,
-                "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
-                "with the GPU update.\n");
-        GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
-                           "Virtual sites are not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(ed == nullptr,
-                           "Essential dynamics is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
-                           "Constraints pulling is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
-                           "Orientation restraints are not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(
-                ir->efep == efepNO
-                        || (!haveFepPerturbedMasses(*top_global) && !havePerturbedConstraints(*top_global)),
-                "Free energy perturbation of masses and constraints are not supported with the GPU "
-                "update.");
-
-        if (constr != nullptr && constr->numConstraintsTotal() > 0)
-        {
-            GMX_LOG(mdlog.info)
-                    .asParagraph()
-                    .appendText("Updating coordinates and applying constraints on the GPU.");
-        }
-        else
-        {
-            GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
-        }
-        GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
-                           "Device stream manager should be initialized in order to use GPU "
-                           "update-constraints.");
-        GMX_RELEASE_ASSERT(
-                fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
-                "Update stream should be initialized in order to use GPU "
-                "update-constraints.");
-        integrator = std::make_unique<UpdateConstrainGpu>(
-                *ir, *top_global, fr->deviceStreamManager->context(),
-                fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
-                stateGpu->xUpdatedOnDevice(), wcycle);
-
-        integrator->setPbc(PbcType::Xyz, state->box);
-    }
-
-    if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
-    {
-        changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
-    }
-    if (useGpuForUpdate)
-    {
-        changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
-    }
-
-    // NOTE: The global state is no longer used at this point.
-    // But state_global is still used as temporary storage space for writing
-    // the global state to file and potentially for replica exchange.
-    // (Global topology should persist.)
-
-    update_mdatoms(mdatoms, state->lambda[efptMASS]);
-
-    if (ir->bExpanded)
-    {
-        /* Check nstexpanded here, because the grompp check was broken */
-        if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
-        {
-            gmx_fatal(FARGS,
-                      "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
-        }
-        init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
-    }
-
-    if (MASTER(cr))
-    {
-        EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
-    }
-
-    preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
-                           startingBehavior != StartingBehavior::NewSimulation);
-
-    // TODO: Remove this by converting AWH into a ForceProvider
-    auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
-                                startingBehavior != StartingBehavior::NewSimulation,
-                                shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
-
-    if (useReplicaExchange && MASTER(cr))
-    {
-        repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
-    }
-    /* PME tuning is only supported in the Verlet scheme, with PME for
-     * Coulomb. It is not supported with only LJ PME. */
-    bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
-                && ir->cutoff_scheme != ecutsGROUP);
-
-    pme_load_balancing_t* pme_loadbal = nullptr;
-    if (bPMETune)
-    {
-        pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
-                         fr->nbv->useGpu());
-    }
-
-    if (!ir->bContinuation)
-    {
-        if (state->flags & (1U << estV))
-        {
-            auto v = makeArrayRef(state->v);
-            /* Set the velocities of vsites, shells and frozen atoms to zero */
-            for (i = 0; i < mdatoms->homenr; i++)
-            {
-                if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
-                {
-                    clear_rvec(v[i]);
-                }
-                else if (mdatoms->cFREEZE)
-                {
-                    for (m = 0; m < DIM; m++)
-                    {
-                        if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
-                        {
-                            v[i][m] = 0;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (constr)
-        {
-            /* Constrain the initial coordinates and velocities */
-            do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
-                               state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
-                               state->box, state->lambda[efptBONDED]);
-        }
-        if (vsite)
-        {
-            /* Construct the virtual sites for the initial configuration */
-            vsite->construct(state->x, ir->delta_t, {}, state->box);
-        }
-    }
-
-    if (ir->efep != efepNO)
-    {
-        /* Set free energy calculation frequency as the greatest common
-         * denominator of nstdhdl and repl_ex_nst. */
-        nstfep = ir->fepvals->nstdhdl;
-        if (ir->bExpanded)
-        {
-            nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
-        }
-        if (useReplicaExchange)
-        {
-            nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
-        }
-        if (ir->bDoAwh)
-        {
-            nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
-        }
-    }
-
-    /* Be REALLY careful about what flags you set here. You CANNOT assume
-     * this is the first step, since we might be restarting from a checkpoint,
-     * and in that case we should not do any modifications to the state.
-     */
-    bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
-
-    // When restarting from a checkpoint, it can be appropriate to
-    // initialize ekind from quantities in the checkpoint. Otherwise,
-    // compute_globals must initialize ekind before the simulation
-    // starts/restarts. However, only the master rank knows what was
-    // found in the checkpoint file, so we have to communicate in
-    // order to coordinate the restart.
-    //
-    // TODO Consider removing this communication if/when checkpoint
-    // reading directly follows .tpr reading, because all ranks can
-    // agree on hasReadEkinState at that time.
-    bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
-    if (PAR(cr))
-    {
-        gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
-    }
-    if (hasReadEkinState)
-    {
-        restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
-    }
-
-    unsigned int cglo_flags =
-            (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
-             | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
-
-    bSumEkinhOld = FALSE;
-
-    t_vcm vcm(top_global->groups, *ir);
-    reportComRemovalInfo(fplog, vcm);
-
-    /* To minimize communication, compute_globals computes the COM velocity
-     * and the kinetic energy for the velocities without COM motion removed.
-     * Thus to get the kinetic energy without the COM contribution, we need
-     * to call compute_globals twice.
-     */
-    for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
-    {
-        unsigned int cglo_flags_iteration = cglo_flags;
-        if (bStopCM && cgloIteration == 0)
-        {
-            cglo_flags_iteration |= CGLO_STOPCM;
-            cglo_flags_iteration &= ~CGLO_TEMPERATURE;
-        }
-        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
-                        enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
-                        state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                        cglo_flags_iteration
-                                | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                         : 0));
-        if (cglo_flags_iteration & CGLO_STOPCM)
-        {
-            /* At initialization, do not pass x with acceleration-correction mode
-             * to avoid (incorrect) correction of the initial coordinates.
-             */
-            auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
-                                                                     : makeArrayRef(state->x);
-            process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
-            inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
-        }
-    }
-    checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
-                                    makeConstArrayRef(state->x), state->box,
-                                    &shouldCheckNumberOfBondedInteractions);
-    if (ir->eI == eiVVAK)
-    {
-        /* a second call to get the half step temperature initialized as well */
-        /* we do the same call as above, but turn the pressure off -- internally to
-           compute_globals, this is recognized as a velocity verlet half-step
-           kinetic energy calculation.  This minimized excess variables, but
-           perhaps loses some logic?*/
-
-        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
-                        enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
-                        state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
-    }
-
-    /* Calculate the initial half step temperature, and save the ekinh_old */
-    if (startingBehavior == StartingBehavior::NewSimulation)
-    {
-        for (i = 0; (i < ir->opts.ngtc); i++)
-        {
-            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
-        }
-    }
-
-    /* need to make an initiation call to get the Trotter variables set, as well as other constants
-       for non-trotter temperature control */
-    auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
-
-    if (MASTER(cr))
-    {
-        if (!ir->bContinuation)
-        {
-            if (constr && ir->eConstrAlg == econtLINCS)
-            {
-                fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
-                        constr->rmsd());
-            }
-            if (EI_STATE_VELOCITY(ir->eI))
-            {
-                real temp = enerd->term[F_TEMP];
-                if (ir->eI != eiVV)
-                {
-                    /* Result of Ekin averaged over velocities of -half
-                     * and +half step, while we only have -half step here.
-                     */
-                    temp *= 2;
-                }
-                fprintf(fplog, "Initial temperature: %g K\n", temp);
-            }
-        }
-
-        char tbuf[20];
-        fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
-        if (ir->nsteps >= 0)
-        {
-            sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
-        }
-        else
-        {
-            sprintf(tbuf, "%s", "infinite");
-        }
-        if (ir->init_step > 0)
-        {
-            fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
-                    gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
-                    gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
-        }
-        else
-        {
-            fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
-        }
-        fprintf(fplog, "\n");
-    }
-
-    walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
-    print_start(fplog, cr, walltime_accounting, "mdrun");
-
-    /***********************************************************
-     *
-     *             Loop over MD steps
-     *
-     ************************************************************/
-
-    bFirstStep = TRUE;
-    /* Skip the first Nose-Hoover integration when we get the state from tpx */
-    bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
-    bSumEkinhOld     = FALSE;
-    bExchanged       = FALSE;
-    bNeedRepartition = FALSE;
-
-    step     = ir->init_step;
-    step_rel = 0;
-
-    auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
-            compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
-            MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
-            mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
-
-    auto checkpointHandler = std::make_unique<CheckpointHandler>(
-            compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
-            ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
-            mdrunOptions.checkpointOptions.period);
-
-    const bool resetCountersIsLocal = true;
-    auto       resetHandler         = std::make_unique<ResetHandler>(
-            compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
-            !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
-            mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
-
-    const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
-
-    if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
-    {
-        logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
-    }
-
-    /* and stop now if we should */
-    bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
-    while (!bLastStep)
-    {
-
-        /* Determine if this is a neighbor search step */
-        bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
-
-        if (bPMETune && bNStList)
-        {
-            // This has to be here because PME load balancing is called so early.
-            // TODO: Move to after all booleans are defined.
-            if (useGpuForUpdate && !bFirstStep)
-            {
-                stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-            }
-            /* PME grid + cut-off optimization with GPUs or PME nodes */
-            pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
-                           fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
-                           &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
-        }
-
-        wallcycle_start(wcycle, ewcSTEP);
-
-        bLastStep = (step_rel == ir->nsteps);
-        t         = t0 + step * ir->delta_t;
-
-        // TODO Refactor this, so that nstfep does not need a default value of zero
-        if (ir->efep != efepNO || ir->bSimTemp)
-        {
-            /* find and set the current lambdas */
-            state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
-
-            bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
-            bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
-            bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
-                           && (!bFirstStep));
-        }
-
-        bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
-                     && do_per_step(step, replExParams.exchangeInterval));
-
-        if (doSimulatedAnnealing)
-        {
-            update_annealing_target_temp(ir, t, &upd);
-        }
-
-        /* Stop Center of Mass motion */
-        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
-
-        /* Determine whether or not to do Neighbour Searching */
-        bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
-
-        /* Note that the stopHandler will cause termination at nstglobalcomm
-         * steps. Since this concides with nstcalcenergy, nsttcouple and/or
-         * nstpcouple steps, we have computed the half-step kinetic energy
-         * of the previous step and can always output energies at the last step.
-         */
-        bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
-
-        /* do_log triggers energy and virial calculation. Because this leads
-         * to different code paths, forces can be different. Thus for exact
-         * continuation we should avoid extra log output.
-         * Note that the || bLastStep can result in non-exact continuation
-         * beyond the last step. But we don't consider that to be an issue.
-         */
-        do_log     = (do_per_step(step, ir->nstlog)
-                  || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
-        do_verbose = mdrunOptions.verbose
-                     && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
-
-        if (useGpuForUpdate && !bFirstStep && bNS)
-        {
-            // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
-            stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-            // Copy coordinate from the GPU when needed at the search step.
-            // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
-            // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
-            stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-
-        if (bNS && !(bFirstStep && ir->bContinuation))
-        {
-            bMasterState = FALSE;
-            /* Correct the new box if it is too skewed */
-            if (inputrecDynamicBox(ir))
-            {
-                if (correct_box(fplog, step, state->box))
-                {
-                    bMasterState = TRUE;
-                    // If update is offloaded, it should be informed about the box size change
-                    if (useGpuForUpdate)
-                    {
-                        integrator->setPbc(PbcType::Xyz, state->box);
-                    }
-                }
-            }
-            if (DOMAINDECOMP(cr) && bMasterState)
-            {
-                dd_collect_state(cr->dd, state, state_global);
-            }
-
-            if (DOMAINDECOMP(cr))
-            {
-                /* Repartition the domain decomposition */
-                dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
-                                    *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
-                                    fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
-                shouldCheckNumberOfBondedInteractions = true;
-                upd.setNumAtoms(state->natoms);
-            }
-        }
-
-        // Allocate or re-size GPU halo exchange object, if necessary
-        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
-        {
-            GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
-                               "GPU device manager has to be initialized to use GPU "
-                               "version of halo exchange.");
-            constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
-        }
-
-        if (MASTER(cr) && do_log)
-        {
-            gmx::EnergyOutput::printHeader(fplog, step,
-                                           t); /* can we improve the information printed here? */
-        }
-
-        if (ir->efep != efepNO)
-        {
-            update_mdatoms(mdatoms, state->lambda[efptMASS]);
-        }
-
-        if (bExchanged)
-        {
-
-            /* We need the kinetic energy at minus the half step for determining
-             * the full step kinetic energy and possibly for T-coupling.*/
-            /* This may not be quite working correctly yet . . . . */
-            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                            makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
-                            enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
-                            state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                            CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
-            checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
-                                            &top, makeConstArrayRef(state->x), state->box,
-                                            &shouldCheckNumberOfBondedInteractions);
-        }
-        clear_mat(force_vir);
-
-        checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
-
-        /* Determine the energy and pressure:
-         * at nstcalcenergy steps and at energy output steps (set below).
-         */
-        if (EI_VV(ir->eI) && (!bInitStep))
-        {
-            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
-            bCalcVir      = bCalcEnerStep
-                       || (ir->epc != epcNO
-                           && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
-        }
-        else
-        {
-            bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
-            bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
-        }
-        bCalcEner = bCalcEnerStep;
-
-        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
-
-        if (do_ene || do_log || bDoReplEx)
-        {
-            bCalcVir  = TRUE;
-            bCalcEner = TRUE;
-        }
-
-        /* Do we need global communication ? */
-        bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
-                  || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
-
-        force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
-                       | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
-                       | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
-        if (fr->useMts && !do_per_step(step, ir->nstfout))
-        {
-            force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
-        }
-
-        if (shellfc)
-        {
-            /* Now is the time to relax the shells */
-            relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
-                                imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
-                                state->natoms, state->x.arrayRefWithPadding(),
-                                state->v.arrayRefWithPadding(), state->box, state->lambda,
-                                &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
-                                fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
-        }
-        else
-        {
-            /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
-               is updated (or the AWH update will be performed twice for one step when continuing).
-               It would be best to call this update function from do_md_trajectory_writing but that
-               would occur after do_force. One would have to divide the update_awh function into one
-               function applying the AWH force and one doing the AWH bias update. The update AWH
-               bias function could then be called after do_md_trajectory_writing (then containing
-               update_awh_history). The checkpointing will in the future probably moved to the start
-               of the md loop which will rid of this issue. */
-            if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
-            {
-                awh->updateHistory(state_global->awhHistory.get());
-            }
-
-            /* The coordinates (x) are shifted (to get whole molecules)
-             * in do_force.
-             * This is parallellized as well, and does communication too.
-             * Check comments in sim_util.c
-             */
-            do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
-                     nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
-                     &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
-                     vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
-                     (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
-        }
-
-        // VV integrators do not need the following velocity half step
-        // if it is the first step after starting from a checkpoint.
-        // That is, the half step is needed on all other steps, and
-        // also the first step when starting from a .tpr file.
-        if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
-        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
-        {
-            rvec* vbuf = nullptr;
-
-            wallcycle_start(wcycle, ewcUPDATE);
-            if (ir->eI == eiVV && bInitStep)
-            {
-                /* if using velocity verlet with full time step Ekin,
-                 * take the first half step only to compute the
-                 * virial for the first step. From there,
-                 * revert back to the initial coordinates
-                 * so that the input is actually the initial step.
-                 */
-                snew(vbuf, state->natoms);
-                copy_rvecn(state->v.rvec_array(), vbuf, 0,
-                           state->natoms); /* should make this better for parallelizing? */
-            }
-            else
-            {
-                /* this is for NHC in the Ekin(t+dt/2) version of vv */
-                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
-                               trotter_seq, ettTSEQ1);
-            }
-
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtVELOCITY1, cr, constr != nullptr);
-
-            wallcycle_stop(wcycle, ewcUPDATE);
-            constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
-            wallcycle_start(wcycle, ewcUPDATE);
-            /* if VV, compute the pressure and constraints */
-            /* For VV2, we strictly only need this if using pressure
-             * control, but we really would like to have accurate pressures
-             * printed out.
-             * Think about ways around this in the future?
-             * For now, keep this choice in comments.
-             */
-            /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
-            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
-            bPres = TRUE;
-            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
-            if (bCalcEner && ir->eI == eiVVAK)
-            {
-                bSumEkinhOld = TRUE;
-            }
-            /* for vv, the first half of the integration actually corresponds to the previous step.
-               So we need information from the last step in the first half of the integration */
-            if (bGStat || do_per_step(step - 1, nstglobalcomm))
-            {
-                wallcycle_stop(wcycle, ewcUPDATE);
-                compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                                makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
-                                enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
-                                state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                                (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
-                                        | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
-                                        | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
-                                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                                 : 0)
-                                        | CGLO_SCALEEKIN);
-                /* explanation of above:
-                   a) We compute Ekin at the full time step
-                   if 1) we are using the AveVel Ekin, and it's not the
-                   initial step, or 2) if we are using AveEkin, but need the full
-                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
-                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
-                   EkinAveVel because it's needed for the pressure */
-                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
-                                                top_global, &top, makeConstArrayRef(state->x),
-                                                state->box, &shouldCheckNumberOfBondedInteractions);
-                if (bStopCM)
-                {
-                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
-                                           makeArrayRef(state->v));
-                    inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
-                }
-                wallcycle_start(wcycle, ewcUPDATE);
-            }
-            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
-            if (!bInitStep)
-            {
-                if (bTrotter)
-                {
-                    m_add(force_vir, shake_vir,
-                          total_vir); /* we need the un-dispersion corrected total vir here */
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
-                                   trotter_seq, ettTSEQ2);
-
-                    /* TODO This is only needed when we're about to write
-                     * a checkpoint, because we use it after the restart
-                     * (in a kludge?). But what should we be doing if
-                     * the startingBehavior is NewSimulation or bInitStep are true? */
-                    if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
-                    {
-                        copy_mat(shake_vir, state->svir_prev);
-                        copy_mat(force_vir, state->fvir_prev);
-                    }
-                    if ((inputrecNptTrotter(ir) || inputrecNvtTrotter(ir)) && ir->eI == eiVV)
-                    {
-                        /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
-                        enerd->term[F_TEMP] =
-                                sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
-                        enerd->term[F_EKIN] = trace(ekind->ekin);
-                    }
-                }
-                else if (bExchanged)
-                {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    /* We need the kinetic energy at minus the half step for determining
-                     * the full step kinetic energy and possibly for T-coupling.*/
-                    /* This may not be quite working correctly yet . . . . */
-                    compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                                    makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
-                                    enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
-                                    state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
-                    wallcycle_start(wcycle, ewcUPDATE);
-                }
-            }
-            /* if it's the initial step, we performed this first step just to get the constraint virial */
-            if (ir->eI == eiVV && bInitStep)
-            {
-                copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
-                sfree(vbuf);
-            }
-            wallcycle_stop(wcycle, ewcUPDATE);
-        }
-
-        /* compute the conserved quantity */
-        if (EI_VV(ir->eI))
-        {
-            saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
-            if (ir->eI == eiVV)
-            {
-                last_ekin = enerd->term[F_EKIN];
-            }
-            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
-            {
-                saved_conserved_quantity -= enerd->term[F_DISPCORR];
-            }
-            /* sum up the foreign kinetic energy and dK/dl terms for vv.  currently done every step so that dhdl is correct in the .edr */
-            if (ir->efep != efepNO)
-            {
-                accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
-            }
-        }
-
-        /* ########  END FIRST UPDATE STEP  ############## */
-        /* ########  If doing VV, we now have v(dt) ###### */
-        if (bDoExpanded)
-        {
-            /* perform extended ensemble sampling in lambda - we don't
-               actually move to the new state before outputting
-               statistics, but if performing simulated tempering, we
-               do update the velocities and the tau_t. */
-
-            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
-                                              state->dfhist, step, state->v.rvec_array(), mdatoms);
-            /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
-            if (MASTER(cr))
-            {
-                copy_df_history(state_global->dfhist, state->dfhist);
-            }
-        }
-
-        // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
-        // coordinates have not already been copied for i) search or ii) CPU force tasks.
-        if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
-            && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
-                || checkpointHandler->isCheckpointingStep()))
-        {
-            stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-        }
-        // Copy velocities if needed for the output/checkpointing.
-        // NOTE: Copy on the search steps is done at the beginning of the step.
-        if (useGpuForUpdate && !bNS
-            && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
-        {
-            stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-        }
-        // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
-        // and update is offloaded hence forces are kept on the GPU for update and have not been
-        // already transferred in do_force().
-        // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
-        //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
-        //       prior to GPU update.
-        // TODO: When the output flags will be included in step workload, this copy can be combined with the
-        //       copy call in do_force(...).
-        // NOTE: The forces should not be copied here if the vsites are present, since they were modified
-        //       on host after the D2H copy in do_force(...).
-        if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
-            && do_per_step(step, ir->nstfout))
-        {
-            stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
-            stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
-        }
-        /* Now we have the energies and forces corresponding to the
-         * coordinates at time t. We must output all of this before
-         * the update.
-         */
-        do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
-                                 observablesHistory, top_global, fr, outf, energyOutput, ekind,
-                                 f.view().force(), checkpointHandler->isCheckpointingStep(),
-                                 bRerunMD, bLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
-        /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
-        bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
-
-        /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
-        if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
-            && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
-        {
-            copy_mat(state->svir_prev, shake_vir);
-            copy_mat(state->fvir_prev, force_vir);
-        }
-
-        stopHandler->setSignal();
-        resetHandler->setSignal(walltime_accounting);
-
-        if (bGStat || !PAR(cr))
-        {
-            /* In parallel we only have to check for checkpointing in steps
-             * where we do global communication,
-             *  otherwise the other nodes don't know.
-             */
-            checkpointHandler->setSignal(walltime_accounting);
-        }
-
-        /* #########   START SECOND UPDATE STEP ################# */
-
-        /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
-           controlled in preprocessing */
-
-        if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
-        {
-            gmx_bool bIfRandomize;
-            bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
-            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
-            if (constr && bIfRandomize)
-            {
-                constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
-            }
-        }
-        /* Box is changed in update() when we do pressure coupling,
-         * but we should still use the old box for energy corrections and when
-         * writing it to the energy file, so it matches the trajectory files for
-         * the same timestep above. Make a copy in a separate array.
-         */
-        copy_mat(state->box, lastbox);
-
-        dvdl_constr = 0;
-
-        if (!useGpuForUpdate)
-        {
-            wallcycle_start(wcycle, ewcUPDATE);
-        }
-        /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
-        if (bTrotter)
-        {
-            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
-            /* We can only do Berendsen coupling after we have summed
-             * the kinetic energy or virial. Since the happens
-             * in global_state after update, we should only do it at
-             * step % nstlist = 1 with bGStatEveryStep=FALSE.
-             */
-        }
-        else
-        {
-            update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
-            update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
-        }
-
-        if (EI_VV(ir->eI))
-        {
-            /* velocity half-step update */
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtVELOCITY2, cr, constr != nullptr);
-        }
-
-        /* Above, initialize just copies ekinh into ekin,
-         * it doesn't copy position (for VV),
-         * and entire integrator for MD.
-         */
-
-        if (ir->eI == eiVVAK)
-        {
-            cbuf.resize(state->x.size());
-            std::copy(state->x.begin(), state->x.end(), cbuf.begin());
-        }
-
-        /* With leap-frog type integrators we compute the kinetic energy
-         * at a whole time step as the average of the half-time step kinetic
-         * energies of two subsequent steps. Therefore we need to compute the
-         * half step kinetic energy also if we need energies at the next step.
-         */
-        const bool needHalfStepKineticEnergy =
-                (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
-
-        // Parrinello-Rahman requires the pressure to be availible before the update to compute
-        // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
-        const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
-                                         && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
-
-        if (useGpuForUpdate)
-        {
-            if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
-            {
-                integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
-                                stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
-
-                // Copy data to the GPU after buffers might have being reinitialized
-                stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
-            }
-
-            if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
-                && !thisRankHasDuty(cr, DUTY_PME))
-            {
-                // The PME forces were recieved to the host, so have to be copied
-                stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
-            }
-            else if (!runScheduleWork->stepWork.useGpuFBufferOps)
-            {
-                // The buffer ops were not offloaded this step, so the forces are on the
-                // host and have to be copied
-                stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
-            }
-
-            const bool doTemperatureScaling =
-                    (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
-
-            // This applies Leap-Frog, LINCS and SETTLE in succession
-            integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
-                                          AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                                  ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
-                                  ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
-
-            // Copy velocities D2H after update if:
-            // - Globals are computed this step (includes the energy output steps).
-            // - Temperature is needed for the next step.
-            if (bGStat || needHalfStepKineticEnergy)
-            {
-                stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-                stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-            }
-        }
-        else
-        {
-            /* With multiple time stepping we need to do an additional normal
-             * update step to obtain the virial, as the actual MTS integration
-             * using an acceleration where the slow forces are multiplied by mtsFactor.
-             * Using that acceleration would result in a virial with the slow
-             * force contribution would be a factor mtsFactor too large.
-             */
-            if (fr->useMts && bCalcVir && constr != nullptr)
-            {
-                upd.update_for_constraint_virial(*ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
-
-                constrain_coordinates(constr, do_log, do_ene, step, state,
-                                      upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
-            }
-
-            ArrayRefWithPadding<const RVec> forceCombined =
-                    (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
-                            ? f.view().forceMtsCombinedWithPadding()
-                            : f.view().forceWithPadding();
-            upd.update_coords(*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M,
-                              etrtPOSITION, cr, constr != nullptr);
-
-            wallcycle_stop(wcycle, ewcUPDATE);
-
-            constrain_coordinates(constr, do_log, do_ene, step, state, upd.xp()->arrayRefWithPadding(),
-                                  &dvdl_constr, bCalcVir && !fr->useMts, shake_vir);
-
-            upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
-                                      constr, do_log, do_ene);
-            upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
-        }
-
-        if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
-        {
-            updatePrevStepPullCom(pull_work, state);
-        }
-
-        if (ir->eI == eiVVAK)
-        {
-            /* erase F_EKIN and F_TEMP here? */
-            /* just compute the kinetic energy at the half step to perform a trotter step */
-            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                            makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
-                            force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
-                            nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
-            wallcycle_start(wcycle, ewcUPDATE);
-            trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
-            /* now we know the scaling, we can compute the positions again */
-            std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
-
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtPOSITION, cr, constr != nullptr);
-            wallcycle_stop(wcycle, ewcUPDATE);
-
-            /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
-            /* are the small terms in the shake_vir here due
-             * to numerical errors, or are they important
-             * physically? I'm thinking they are just errors, but not completely sure.
-             * For now, will call without actually constraining, constr=NULL*/
-            upd.finish_update(*ir, mdatoms, state, wcycle, false);
-        }
-        if (EI_VV(ir->eI))
-        {
-            /* this factor or 2 correction is necessary
-               because half of the constraint force is removed
-               in the vv step, so we have to double it.  See
-               the Issue #1255.  It is not yet clear
-               if the factor of 2 is exact, or just a very
-               good approximation, and this will be
-               investigated.  The next step is to see if this
-               can be done adding a dhdl contribution from the
-               rattle step, but this is somewhat more
-               complicated with the current code. Will be
-               investigated, hopefully for 4.6.3. However,
-               this current solution is much better than
-               having it completely wrong.
-             */
-            enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
-        }
-        else
-        {
-            enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-        }
-
-        if (vsite != nullptr)
-        {
-            wallcycle_start(wcycle, ewcVSITECONSTR);
-            vsite->construct(state->x, ir->delta_t, state->v, state->box);
-            wallcycle_stop(wcycle, ewcVSITECONSTR);
-        }
-
-        /* ############## IF NOT VV, Calculate globals HERE  ############ */
-        /* With Leap-Frog we can skip compute_globals at
-         * non-communication steps, but we need to calculate
-         * the kinetic energy one step before communication.
-         */
-        {
-            // Organize to do inter-simulation signalling on steps if
-            // and when algorithms require it.
-            const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
-
-            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
-            {
-                // Copy coordinates when needed to stop the CM motion.
-                if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
-                {
-                    stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
-                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-                }
-                // Since we're already communicating at this step, we
-                // can propagate intra-simulation signals. Note that
-                // check_nstglobalcomm has the responsibility for
-                // choosing the value of nstglobalcomm that is one way
-                // bGStat becomes true, so we can't get into a
-                // situation where e.g. checkpointing can't be
-                // signalled.
-                bool                doIntraSimSignal = true;
-                SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
-
-                compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                                makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
-                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
-                                &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                                (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
-                                        | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
-                                        | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
-                                        | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
-                                        | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                                 : 0));
-                checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
-                                                top_global, &top, makeConstArrayRef(state->x),
-                                                state->box, &shouldCheckNumberOfBondedInteractions);
-                if (!EI_VV(ir->eI) && bStopCM)
-                {
-                    process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
-                                           makeArrayRef(state->v));
-                    inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
-
-                    // TODO: The special case of removing CM motion should be dealt more gracefully
-                    if (useGpuForUpdate)
-                    {
-                        stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
-                        // Here we block until the H2D copy completes because event sync with the
-                        // force kernels that use the coordinates on the next steps is not implemented
-                        // (not because of a race on state->x being modified on the CPU while H2D is in progress).
-                        stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
-                        // If the COM removal changed the velocities on the CPU, this has to be accounted for.
-                        if (vcm.mode != ecmNO)
-                        {
-                            stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                        }
-                    }
-                }
-            }
-        }
-
-        /* #############  END CALC EKIN AND PRESSURE ################# */
-
-        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-           the virial that should probably be addressed eventually. state->veta has better properies,
-           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
-
-        if (ir->efep != efepNO && !EI_VV(ir->eI))
-        {
-            /* Sum up the foreign energy and dK/dl terms for md and sd.
-               Currently done every step so that dH/dl is correct in the .edr */
-            accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
-        }
-
-        update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
-                                         pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
-
-        const bool doBerendsenPressureCoupling =
-                (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
-        const bool doCRescalePressureCoupling =
-                (inputrec->epc == epcCRESCALE && do_per_step(step, inputrec->nstpcouple));
-        if (useGpuForUpdate
-            && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
-        {
-            integrator->scaleCoordinates(pressureCouplingMu);
-            if (doCRescalePressureCoupling)
-            {
-                matrix pressureCouplingInvMu;
-                gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
-                integrator->scaleVelocities(pressureCouplingInvMu);
-            }
-            integrator->setPbc(PbcType::Xyz, state->box);
-        }
-
-        /* ################# END UPDATE STEP 2 ################# */
-        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
-
-        /* The coordinates (x) were unshifted in update */
-        if (!bGStat)
-        {
-            /* We will not sum ekinh_old,
-             * so signal that we still have to do it.
-             */
-            bSumEkinhOld = TRUE;
-        }
-
-        if (bCalcEner)
-        {
-            /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
-
-            /* use the directly determined last velocity, not actually the averaged half steps */
-            if (bTrotter && ir->eI == eiVV)
-            {
-                enerd->term[F_EKIN] = last_ekin;
-            }
-            enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
-
-            if (integratorHasConservedEnergyQuantity(ir))
-            {
-                if (EI_VV(ir->eI))
-                {
-                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
-                }
-                else
-                {
-                    enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
-                }
-            }
-            /* #########  END PREPARING EDR OUTPUT  ###########  */
-        }
-
-        /* Output stuff */
-        if (MASTER(cr))
-        {
-            if (fplog && do_log && bDoExpanded)
-            {
-                /* only needed if doing expanded ensemble */
-                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
-                                          ir->bSimTemp ? ir->simtempvals : nullptr,
-                                          state_global->dfhist, state->fep_state, ir->nstlog, step);
-            }
-            if (bCalcEner)
-            {
-                energyOutput.addDataAtEnergyStep(
-                        bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
-                        ir->expandedvals, lastbox,
-                        PTCouplingArrays{ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
-                                          state->nhpres_xi, state->nhpres_vxi },
-                        state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
-            }
-            else
-            {
-                energyOutput.recordNonEnergyStep();
-            }
-
-            gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
-            gmx_bool do_or = do_per_step(step, ir->nstorireout);
-
-            if (doSimulatedAnnealing)
-            {
-                gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
-                                                              &(ir->opts));
-            }
-            if (do_log || do_ene || do_dr || do_or)
-            {
-                energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
-                                                   do_log ? fplog : nullptr, step, t,
-                                                   fr->fcdata.get(), awh.get());
-            }
-            if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
-            {
-                const bool isInitialOutput = false;
-                printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
-            }
-
-            if (ir->bPull)
-            {
-                pull_print_output(pull_work, step, t);
-            }
-
-            if (do_per_step(step, ir->nstlog))
-            {
-                if (fflush(fplog) != 0)
-                {
-                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
-                }
-            }
-        }
-        if (bDoExpanded)
-        {
-            /* Have to do this part _after_ outputting the logfile and the edr file */
-            /* Gets written into the state at the beginning of next loop*/
-            state->fep_state = lamnew;
-        }
-        else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
-        {
-            state->fep_state = awh->fepLambdaState();
-        }
-        /* Print the remaining wall clock time for the run */
-        if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
-        {
-            if (shellfc)
-            {
-                fprintf(stderr, "\n");
-            }
-            print_time(stderr, walltime_accounting, step, ir, cr);
-        }
-
-        /* Ion/water position swapping.
-         * Not done in last step since trajectory writing happens before this call
-         * in the MD loop and exchanges would be lost anyway. */
-        bNeedRepartition = FALSE;
-        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
-        {
-            bNeedRepartition =
-                    do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
-                                  state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
-
-            if (bNeedRepartition && DOMAINDECOMP(cr))
-            {
-                dd_collect_state(cr->dd, state, state_global);
-            }
-        }
-
-        /* Replica exchange */
-        bExchanged = FALSE;
-        if (bDoReplEx)
-        {
-            bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
-        }
-
-        if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
-        {
-            dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
-                                imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
-                                nrnb, wcycle, FALSE);
-            shouldCheckNumberOfBondedInteractions = true;
-            upd.setNumAtoms(state->natoms);
-        }
-
-        bFirstStep = FALSE;
-        bInitStep  = FALSE;
-
-        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
-        /* With all integrators, except VV, we need to retain the pressure
-         * at the current step for coupling at the next step.
-         */
-        if ((state->flags & (1U << estPRES_PREV))
-            && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
-        {
-            /* Store the pressure in t_state for pressure coupling
-             * at the next MD step.
-             */
-            copy_mat(pres, state->pres_prev);
-        }
-
-        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
-
-        if ((membed != nullptr) && (!bLastStep))
-        {
-            rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
-        }
-
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
-        if (DOMAINDECOMP(cr) && wcycle)
-        {
-            dd_cycles_add(cr->dd, cycles, ddCyclStep);
-        }
-
-        /* increase the MD step number */
-        step++;
-        step_rel++;
-
-#if GMX_FAHCORE
-        if (MASTER(cr))
-        {
-            fcReportProgress(ir->nsteps + ir->init_step, step);
-        }
-#endif
-
-        resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
-                                    fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
-
-        /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
-        imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
-    }
-    /* End of main MD loop */
-
-    /* Closing TNG files can include compressing data. Therefore it is good to do that
-     * before stopping the time measurements. */
-    mdoutf_tng_close(outf);
-
-    /* Stop measuring walltime */
-    walltime_accounting_end_time(walltime_accounting);
-
-    if (!thisRankHasDuty(cr, DUTY_PME))
-    {
-        /* Tell the PME only node to finish */
-        gmx_pme_send_finish(cr);
-    }
-
-    if (MASTER(cr))
-    {
-        if (ir->nstcalcenergy > 0)
-        {
-            energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
-
-            gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
-            energyOutput.printAverages(fplog, groups);
-        }
-    }
-    done_mdoutf(outf);
-
-    if (bPMETune)
-    {
-        pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
-    }
-
-    done_shellfc(fplog, shellfc, step_rel);
-
-    if (useReplicaExchange && MASTER(cr))
-    {
-        print_replica_exchange_statistics(fplog, repl_ex);
-    }
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
-
-    global_stat_destroy(gstat);
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/minimize.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/minimize.cpp
deleted file mode 100644
index 0d628b98d4..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/minimize.cpp
+++ /dev/null
@@ -1,2958 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief This file defines integrators for energy minimization
- *
- * \author Berk Hess <hess@kth.se>
- * \author Erik Lindahl <erik@kth.se>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "config.h"
-
-#include <cmath>
-#include <cstring>
-#include <ctime>
-
-#include <algorithm>
-#include <limits>
-#include <vector>
-
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/collect.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/mdsetup.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/fileio/confio.h"
-#include "gromacs/fileio/mtxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/linearalgebra/sparsematrix.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/coupling.h"
-#include "gromacs/mdlib/dispersioncorrection.h"
-#include "gromacs/mdlib/ebin.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/energyoutput.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/stat.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/trajectory_writing.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/multisim.h" /*PLUMED*/
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdtypes/checkpointdata.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/exceptions.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/smalloc.h"
-
-#include "legacysimulator.h"
-#include "shellfc.h"
-
-using gmx::ArrayRef;
-using gmx::MdrunScheduleWorkload;
-using gmx::RVec;
-using gmx::VirtualSitesHandler;
-
-/* PLUMED */
-#include "../../../Plumed.h"
-extern int    plumedswitch;
-extern plumed plumedmain;
-/* END PLUMED */
-
-//! Utility structure for manipulating states during EM
-typedef struct em_state
-{
-    //! Copy of the global state
-    t_state s;
-    //! Force array
-    gmx::ForceBuffers f;
-    //! Potential energy
-    real epot;
-    //! Norm of the force
-    real fnorm;
-    //! Maximum force
-    real fmax;
-    //! Direction
-    int a_fmax;
-} em_state_t;
-
-//! Print the EM starting conditions
-static void print_em_start(FILE*                     fplog,
-                           const t_commrec*          cr,
-                           gmx_walltime_accounting_t walltime_accounting,
-                           gmx_wallcycle_t           wcycle,
-                           const char*               name)
-{
-    walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
-    print_start(fplog, cr, walltime_accounting, name);
-}
-
-//! Stop counting time for EM
-static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle_t wcycle)
-{
-    wallcycle_stop(wcycle, ewcRUN);
-
-    walltime_accounting_end_time(walltime_accounting);
-}
-
-//! Printing a log file and console header
-static void sp_header(FILE* out, const char* minimizer, real ftol, int nsteps)
-{
-    fprintf(out, "\n");
-    fprintf(out, "%s:\n", minimizer);
-    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
-    fprintf(out, "   Number of steps    = %12d\n", nsteps);
-}
-
-//! Print warning message
-static void warn_step(FILE* fp, real ftol, real fmax, gmx_bool bLastStep, gmx_bool bConstrain)
-{
-    constexpr bool realIsDouble = GMX_DOUBLE;
-    char           buffer[2048];
-
-    if (!std::isfinite(fmax))
-    {
-        sprintf(buffer,
-                "\nEnergy minimization has stopped because the force "
-                "on at least one atom is not finite. This usually means "
-                "atoms are overlapping. Modify the input coordinates to "
-                "remove atom overlap or use soft-core potentials with "
-                "the free energy code to avoid infinite forces.\n%s",
-                !realIsDouble ? "You could also be lucky that switching to double precision "
-                                "is sufficient to obtain finite forces.\n"
-                              : "");
-    }
-    else if (bLastStep)
-    {
-        sprintf(buffer,
-                "\nEnergy minimization reached the maximum number "
-                "of steps before the forces reached the requested "
-                "precision Fmax < %g.\n",
-                ftol);
-    }
-    else
-    {
-        sprintf(buffer,
-                "\nEnergy minimization has stopped, but the forces have "
-                "not converged to the requested precision Fmax < %g (which "
-                "may not be possible for your system). It stopped "
-                "because the algorithm tried to make a new step whose size "
-                "was too small, or there was no change in the energy since "
-                "last step. Either way, we regard the minimization as "
-                "converged to within the available machine precision, "
-                "given your starting configuration and EM parameters.\n%s%s",
-                ftol,
-                !realIsDouble ? "\nDouble precision normally gives you higher accuracy, but "
-                                "this is often not needed for preparing to run molecular "
-                                "dynamics.\n"
-                              : "",
-                bConstrain ? "You might need to increase your constraint accuracy, or turn\n"
-                             "off constraints altogether (set constraints = none in mdp file)\n"
-                           : "");
-    }
-
-    fputs(wrap_lines(buffer, 78, 0, FALSE), stderr);
-    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
-}
-
-//! Print message about convergence of the EM
-static void print_converged(FILE*             fp,
-                            const char*       alg,
-                            real              ftol,
-                            int64_t           count,
-                            gmx_bool          bDone,
-                            int64_t           nsteps,
-                            const em_state_t* ems,
-                            double            sqrtNumAtoms)
-{
-    char buf[STEPSTRSIZE];
-
-    if (bDone)
-    {
-        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", alg, ftol, gmx_step_str(count, buf));
-    }
-    else if (count < nsteps)
-    {
-        fprintf(fp,
-                "\n%s converged to machine precision in %s steps,\n"
-                "but did not reach the requested Fmax < %g.\n",
-                alg, gmx_step_str(count, buf), ftol);
-    }
-    else
-    {
-        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", alg, ftol,
-                gmx_step_str(count, buf));
-    }
-
-#if GMX_DOUBLE
-    fprintf(fp, "Potential Energy  = %21.14e\n", ems->epot);
-    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1);
-    fprintf(fp, "Norm of force     = %21.14e\n", ems->fnorm / sqrtNumAtoms);
-#else
-    fprintf(fp, "Potential Energy  = %14.7e\n", ems->epot);
-    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1);
-    fprintf(fp, "Norm of force     = %14.7e\n", ems->fnorm / sqrtNumAtoms);
-#endif
-}
-
-//! Compute the norm and max of the force array in parallel
-static void get_f_norm_max(const t_commrec*               cr,
-                           t_grpopts*                     opts,
-                           t_mdatoms*                     mdatoms,
-                           gmx::ArrayRef<const gmx::RVec> f,
-                           real*                          fnorm,
-                           real*                          fmax,
-                           int*                           a_fmax)
-{
-    double fnorm2, *sum;
-    real   fmax2, fam;
-    int    la_max, a_max, start, end, i, m, gf;
-
-    /* This routine finds the largest force and returns it.
-     * On parallel machines the global max is taken.
-     */
-    fnorm2 = 0;
-    fmax2  = 0;
-    la_max = -1;
-    start  = 0;
-    end    = mdatoms->homenr;
-    if (mdatoms->cFREEZE)
-    {
-        for (i = start; i < end; i++)
-        {
-            gf  = mdatoms->cFREEZE[i];
-            fam = 0;
-            for (m = 0; m < DIM; m++)
-            {
-                if (!opts->nFreeze[gf][m])
-                {
-                    fam += gmx::square(f[i][m]);
-                }
-            }
-            fnorm2 += fam;
-            if (fam > fmax2)
-            {
-                fmax2  = fam;
-                la_max = i;
-            }
-        }
-    }
-    else
-    {
-        for (i = start; i < end; i++)
-        {
-            fam = norm2(f[i]);
-            fnorm2 += fam;
-            if (fam > fmax2)
-            {
-                fmax2  = fam;
-                la_max = i;
-            }
-        }
-    }
-
-    if (la_max >= 0 && DOMAINDECOMP(cr))
-    {
-        a_max = cr->dd->globalAtomIndices[la_max];
-    }
-    else
-    {
-        a_max = la_max;
-    }
-    if (PAR(cr))
-    {
-        snew(sum, 2 * cr->nnodes + 1);
-        sum[2 * cr->nodeid]     = fmax2;
-        sum[2 * cr->nodeid + 1] = a_max;
-        sum[2 * cr->nnodes]     = fnorm2;
-        gmx_sumd(2 * cr->nnodes + 1, sum, cr);
-        fnorm2 = sum[2 * cr->nnodes];
-        /* Determine the global maximum */
-        for (i = 0; i < cr->nnodes; i++)
-        {
-            if (sum[2 * i] > fmax2)
-            {
-                fmax2 = sum[2 * i];
-                a_max = gmx::roundToInt(sum[2 * i + 1]);
-            }
-        }
-        sfree(sum);
-    }
-
-    if (fnorm)
-    {
-        *fnorm = sqrt(fnorm2);
-    }
-    if (fmax)
-    {
-        *fmax = sqrt(fmax2);
-    }
-    if (a_fmax)
-    {
-        *a_fmax = a_max;
-    }
-}
-
-//! Compute the norm of the force
-static void get_state_f_norm_max(const t_commrec* cr, t_grpopts* opts, t_mdatoms* mdatoms, em_state_t* ems)
-{
-    get_f_norm_max(cr, opts, mdatoms, ems->f.view().force(), &ems->fnorm, &ems->fmax, &ems->a_fmax);
-}
-
-//! Initialize the energy minimization
-static void init_em(FILE*                fplog,
-                    const gmx::MDLogger& mdlog,
-                    const char*          title,
-                    const t_commrec*     cr,
-                    const gmx_multisim_t *ms, /* PLUMED */
-                    t_inputrec*          ir,
-                    gmx::ImdSession*     imdSession,
-                    pull_t*              pull_work,
-                    t_state*             state_global,
-                    const gmx_mtop_t*    top_global,
-                    em_state_t*          ems,
-                    gmx_localtop_t*      top,
-                    t_nrnb*              nrnb,
-                    t_forcerec*          fr,
-                    gmx::MDAtoms*        mdAtoms,
-                    gmx_global_stat_t*   gstat,
-                    VirtualSitesHandler* vsite,
-                    gmx::Constraints*    constr,
-                    gmx_shellfc_t**      shellfc)
-{
-    real dvdl_constr;
-
-    if (fplog)
-    {
-        fprintf(fplog, "Initiating %s\n", title);
-    }
-
-    if (MASTER(cr))
-    {
-        state_global->ngtc = 0;
-    }
-    int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
-    gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
-    initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
-
-    if (ir->eI == eiNM)
-    {
-        GMX_ASSERT(shellfc != nullptr, "With NM we always support shells");
-
-        *shellfc =
-                init_shell_flexcon(stdout, top_global, constr ? constr->numFlexibleConstraints() : 0,
-                                   ir->nstcalcenergy, DOMAINDECOMP(cr), thisRankHasDuty(cr, DUTY_PME));
-    }
-    else
-    {
-        GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI),
-                   "This else currently only handles energy minimizers, consider if your algorithm "
-                   "needs shell/flexible-constraint support");
-
-        /* With energy minimization, shells and flexible constraints are
-         * automatically minimized when treated like normal DOFS.
-         */
-        if (shellfc != nullptr)
-        {
-            *shellfc = nullptr;
-        }
-    }
-
-    if (DOMAINDECOMP(cr))
-    {
-        dd_init_local_state(cr->dd, state_global, &ems->s);
-
-        /* Distribute the charge groups over the nodes from the master node */
-        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
-                            imdSession, pull_work, &ems->s, &ems->f, mdAtoms, top, fr, vsite,
-                            constr, nrnb, nullptr, FALSE);
-        dd_store_state(cr->dd, &ems->s);
-    }
-    else
-    {
-        state_change_natoms(state_global, state_global->natoms);
-        /* Just copy the state */
-        ems->s = *state_global;
-        state_change_natoms(&ems->s, ems->s.natoms);
-
-        mdAlgorithmsSetupAtomData(cr, ir, *top_global, top, fr, &ems->f, mdAtoms, constr, vsite,
-                                  shellfc ? *shellfc : nullptr);
-    }
-
-    update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]);
-
-    if (constr)
-    {
-        // TODO how should this cross-module support dependency be managed?
-        if (ir->eConstrAlg == econtSHAKE && gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
-        {
-            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
-                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
-        }
-
-        if (!ir->bContinuation)
-        {
-            /* Constrain the starting coordinates */
-            bool needsLogging  = true;
-            bool computeEnergy = true;
-            bool computeVirial = false;
-            dvdl_constr        = 0;
-            constr->apply(needsLogging, computeEnergy, -1, 0, 1.0, ems->s.x.arrayRefWithPadding(),
-                          ems->s.x.arrayRefWithPadding(), ArrayRef<RVec>(), ems->s.box,
-                          ems->s.lambda[efptFEP], &dvdl_constr, gmx::ArrayRefWithPadding<RVec>(),
-                          computeVirial, nullptr, gmx::ConstraintVariable::Positions);
-        }
-    }
-
-    if (PAR(cr))
-    {
-        *gstat = global_stat_init(ir);
-    }
-    else
-    {
-        *gstat = nullptr;
-    }
-
-    calc_shifts(ems->s.box, fr->shift_vec);
-
-    /* PLUMED */
-    if(plumedswitch){
-      if(ms && ms->numSimulations_>1) {
-        if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&ms->mastersComm_);
-        if(PAR(cr)){
-          if(DOMAINDECOMP(cr)) {
-            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
-          }else{
-            plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
-          }
-        }
-        plumed_cmd(plumedmain,"GREX init",nullptr);
-      }
-      if(PAR(cr)){
-        if(DOMAINDECOMP(cr)) {
-          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
-        }else{
-          plumed_cmd(plumedmain,"setMPIComm",&cr->mpi_comm_mysim);
-        }
-      }
-      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
-      plumed_cmd(plumedmain,"setMDEngine","gromacs");
-      plumed_cmd(plumedmain,"setLog",fplog);
-      real real_delta_t;
-      real_delta_t=ir->delta_t;
-      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
-      plumed_cmd(plumedmain,"init",nullptr);
-
-      if(PAR(cr)){
-        if(DOMAINDECOMP(cr)) {
-          int nat_home = dd_numHomeAtoms(*cr->dd);
-          plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
-          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
-
-        }
-      }
-    }
-    /* END PLUMED */
-}
-
-//! Finalize the minimization
-static void finish_em(const t_commrec*          cr,
-                      gmx_mdoutf_t              outf,
-                      gmx_walltime_accounting_t walltime_accounting,
-                      gmx_wallcycle_t           wcycle)
-{
-    if (!thisRankHasDuty(cr, DUTY_PME))
-    {
-        /* Tell the PME only node to finish */
-        gmx_pme_send_finish(cr);
-    }
-
-    done_mdoutf(outf);
-
-    em_time_end(walltime_accounting, wcycle);
-}
-
-//! Swap two different EM states during minimization
-static void swap_em_state(em_state_t** ems1, em_state_t** ems2)
-{
-    em_state_t* tmp;
-
-    tmp   = *ems1;
-    *ems1 = *ems2;
-    *ems2 = tmp;
-}
-
-//! Save the EM trajectory
-static void write_em_traj(FILE*               fplog,
-                          const t_commrec*    cr,
-                          gmx_mdoutf_t        outf,
-                          gmx_bool            bX,
-                          gmx_bool            bF,
-                          const char*         confout,
-                          const gmx_mtop_t*   top_global,
-                          t_inputrec*         ir,
-                          int64_t             step,
-                          em_state_t*         state,
-                          t_state*            state_global,
-                          ObservablesHistory* observablesHistory)
-{
-    int mdof_flags = 0;
-
-    if (bX)
-    {
-        mdof_flags |= MDOF_X;
-    }
-    if (bF)
-    {
-        mdof_flags |= MDOF_F;
-    }
-
-    /* If we want IMD output, set appropriate MDOF flag */
-    if (ir->bIMD)
-    {
-        mdof_flags |= MDOF_IMD;
-    }
-
-    gmx::WriteCheckpointDataHolder checkpointDataHolder;
-    mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, top_global->natoms, step,
-                                     static_cast<double>(step), &state->s, state_global,
-                                     observablesHistory, state->f.view().force(), &checkpointDataHolder);
-
-    if (confout != nullptr)
-    {
-        if (DOMAINDECOMP(cr))
-        {
-            /* If bX=true, x was collected to state_global in the call above */
-            if (!bX)
-            {
-                auto globalXRef = MASTER(cr) ? state_global->x : gmx::ArrayRef<gmx::RVec>();
-                dd_collect_vec(cr->dd, state->s.ddp_count, state->s.ddp_count_cg_gl, state->s.cg_gl,
-                               state->s.x, globalXRef);
-            }
-        }
-        else
-        {
-            /* Copy the local state pointer */
-            state_global = &state->s;
-        }
-
-        if (MASTER(cr))
-        {
-            if (ir->pbcType != PbcType::No && !ir->bPeriodicMols && DOMAINDECOMP(cr))
-            {
-                /* Make molecules whole only for confout writing */
-                do_pbc_mtop(ir->pbcType, state->s.box, top_global, state_global->x.rvec_array());
-            }
-
-            write_sto_conf_mtop(confout, *top_global->name, top_global,
-                                state_global->x.rvec_array(), nullptr, ir->pbcType, state->s.box);
-        }
-    }
-}
-
-//! \brief Do one minimization step
-//
-// \returns true when the step succeeded, false when a constraint error occurred
-static bool do_em_step(const t_commrec*                          cr,
-                       t_inputrec*                               ir,
-                       t_mdatoms*                                md,
-                       em_state_t*                               ems1,
-                       real                                      a,
-                       gmx::ArrayRefWithPadding<const gmx::RVec> force,
-                       em_state_t*                               ems2,
-                       gmx::Constraints*                         constr,
-                       int64_t                                   count)
-
-{
-    t_state *s1, *s2;
-    int      start, end;
-    real     dvdl_constr;
-    int nthreads gmx_unused;
-
-    bool validStep = true;
-
-    s1 = &ems1->s;
-    s2 = &ems2->s;
-
-    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
-    {
-        gmx_incons("state mismatch in do_em_step");
-    }
-
-    s2->flags = s1->flags;
-
-    if (s2->natoms != s1->natoms)
-    {
-        state_change_natoms(s2, s1->natoms);
-        ems2->f.resize(s2->natoms);
-    }
-    if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size())
-    {
-        s2->cg_gl.resize(s1->cg_gl.size());
-    }
-
-    copy_mat(s1->box, s2->box);
-    /* Copy free energy state */
-    s2->lambda = s1->lambda;
-    copy_mat(s1->box, s2->box);
-
-    start = 0;
-    end   = md->homenr;
-
-    nthreads = gmx_omp_nthreads_get(emntUpdate);
-#pragma omp parallel num_threads(nthreads)
-    {
-        const rvec* x1 = s1->x.rvec_array();
-        rvec*       x2 = s2->x.rvec_array();
-        const rvec* f  = as_rvec_array(force.unpaddedArrayRef().data());
-
-        int gf = 0;
-#pragma omp for schedule(static) nowait
-        for (int i = start; i < end; i++)
-        {
-            try
-            {
-                if (md->cFREEZE)
-                {
-                    gf = md->cFREEZE[i];
-                }
-                for (int m = 0; m < DIM; m++)
-                {
-                    if (ir->opts.nFreeze[gf][m])
-                    {
-                        x2[i][m] = x1[i][m];
-                    }
-                    else
-                    {
-                        x2[i][m] = x1[i][m] + a * f[i][m];
-                    }
-                }
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-        }
-
-        if (s2->flags & (1 << estCGP))
-        {
-            /* Copy the CG p vector */
-            const rvec* p1 = s1->cg_p.rvec_array();
-            rvec*       p2 = s2->cg_p.rvec_array();
-#pragma omp for schedule(static) nowait
-            for (int i = start; i < end; i++)
-            {
-                // Trivial OpenMP block that does not throw
-                copy_rvec(p1[i], p2[i]);
-            }
-        }
-
-        if (DOMAINDECOMP(cr))
-        {
-            /* OpenMP does not supported unsigned loop variables */
-#pragma omp for schedule(static) nowait
-            for (gmx::index i = 0; i < gmx::ssize(s2->cg_gl); i++)
-            {
-                s2->cg_gl[i] = s1->cg_gl[i];
-            }
-        }
-    }
-
-    if (DOMAINDECOMP(cr))
-    {
-        s2->ddp_count       = s1->ddp_count;
-        s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
-    }
-
-    if (constr)
-    {
-        dvdl_constr = 0;
-        validStep   = constr->apply(
-                TRUE, TRUE, count, 0, 1.0, s1->x.arrayRefWithPadding(), s2->x.arrayRefWithPadding(),
-                ArrayRef<RVec>(), s2->box, s2->lambda[efptBONDED], &dvdl_constr,
-                gmx::ArrayRefWithPadding<RVec>(), false, nullptr, gmx::ConstraintVariable::Positions);
-
-        if (cr->nnodes > 1)
-        {
-            /* This global reduction will affect performance at high
-             * parallelization, but we can not really avoid it.
-             * But usually EM is not run at high parallelization.
-             */
-            int reductionBuffer = static_cast<int>(!validStep);
-            gmx_sumi(1, &reductionBuffer, cr);
-            validStep = (reductionBuffer == 0);
-        }
-
-        // We should move this check to the different minimizers
-        if (!validStep && ir->eI != eiSteep)
-        {
-            gmx_fatal(FARGS,
-                      "The coordinates could not be constrained. Minimizer '%s' can not handle "
-                      "constraint failures, use minimizer '%s' before using '%s'.",
-                      EI(ir->eI), EI(eiSteep), EI(ir->eI));
-        }
-    }
-
-    return validStep;
-}
-
-//! Prepare EM for using domain decomposition parallellization
-static void em_dd_partition_system(FILE*                fplog,
-                                   const gmx::MDLogger& mdlog,
-                                   int                  step,
-                                   const t_commrec*     cr,
-                                   const gmx_mtop_t*    top_global,
-                                   t_inputrec*          ir,
-                                   gmx::ImdSession*     imdSession,
-                                   pull_t*              pull_work,
-                                   em_state_t*          ems,
-                                   gmx_localtop_t*      top,
-                                   gmx::MDAtoms*        mdAtoms,
-                                   t_forcerec*          fr,
-                                   VirtualSitesHandler* vsite,
-                                   gmx::Constraints*    constr,
-                                   t_nrnb*              nrnb,
-                                   gmx_wallcycle_t      wcycle)
-{
-    /* Repartition the domain decomposition */
-    dd_partition_system(fplog, mdlog, step, cr, FALSE, 1, nullptr, *top_global, ir, imdSession, pull_work,
-                        &ems->s, &ems->f, mdAtoms, top, fr, vsite, constr, nrnb, wcycle, FALSE);
-    dd_store_state(cr->dd, &ems->s);
-}
-
-namespace
-{
-
-/*! \brief Class to handle the work of setting and doing an energy evaluation.
- *
- * This class is a mere aggregate of parameters to pass to evaluate an
- * energy, so that future changes to names and types of them consume
- * less time when refactoring other code.
- *
- * Aggregate initialization is used, for which the chief risk is that
- * if a member is added at the end and not all initializer lists are
- * updated, then the member will be value initialized, which will
- * typically mean initialization to zero.
- *
- * Use a braced initializer list to construct one of these. */
-class EnergyEvaluator
-{
-public:
-    /*! \brief Evaluates an energy on the state in \c ems.
-     *
-     * \todo In practice, the same objects mu_tot, vir, and pres
-     * are always passed to this function, so we would rather have
-     * them as data members. However, their C-array types are
-     * unsuited for aggregate initialization. When the types
-     * improve, the call signature of this method can be reduced.
-     */
-    void run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres, int64_t count, gmx_bool bFirst);
-    //! Handles logging (deprecated).
-    FILE* fplog;
-    //! Handles logging.
-    const gmx::MDLogger& mdlog;
-    //! Handles communication.
-    const t_commrec* cr;
-    //! Coordinates multi-simulations.
-    const gmx_multisim_t* ms;
-    //! Holds the simulation topology.
-    const gmx_mtop_t* top_global;
-    //! Holds the domain topology.
-    gmx_localtop_t* top;
-    //! User input options.
-    t_inputrec* inputrec;
-    //! The Interactive Molecular Dynamics session.
-    gmx::ImdSession* imdSession;
-    //! The pull work object.
-    pull_t* pull_work;
-    //! Manages flop accounting.
-    t_nrnb* nrnb;
-    //! Manages wall cycle accounting.
-    gmx_wallcycle_t wcycle;
-    //! Coordinates global reduction.
-    gmx_global_stat_t gstat;
-    //! Handles virtual sites.
-    VirtualSitesHandler* vsite;
-    //! Handles constraints.
-    gmx::Constraints* constr;
-    //! Per-atom data for this domain.
-    gmx::MDAtoms* mdAtoms;
-    //! Handles how to calculate the forces.
-    t_forcerec* fr;
-    //! Schedule of force-calculation work each step for this task.
-    MdrunScheduleWorkload* runScheduleWork;
-    //! Stores the computed energies.
-    gmx_enerdata_t* enerd;
-};
-
-void EnergyEvaluator::run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres, int64_t count, gmx_bool bFirst)
-{
-    real     t;
-    gmx_bool bNS;
-    tensor   force_vir, shake_vir, ekin;
-    real     dvdl_constr;
-    real     terminate = 0;
-
-    /* Set the time to the initial time, the time does not change during EM */
-    t = inputrec->init_t;
-
-    if (bFirst || (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
-    {
-        /* This is the first state or an old state used before the last ns */
-        bNS = TRUE;
-    }
-    else
-    {
-        bNS = FALSE;
-        if (inputrec->nstlist > 0)
-        {
-            bNS = TRUE;
-        }
-    }
-
-    if (vsite)
-    {
-        vsite->construct(ems->s.x, 1, {}, ems->s.box);
-    }
-
-    if (DOMAINDECOMP(cr) && bNS)
-    {
-        /* Repartition the domain decomposition */
-        em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, imdSession, pull_work,
-                               ems, top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-    }
-
-    /* Calc force & energy on new trial position  */
-    /* do_force always puts the charge groups in the box and shifts again
-     * We do not unshift, so molecules are always whole in congrad.c
-     */
-    /* PLUMED */
-    int plumedNeedsEnergy=0;
-    matrix plumed_vir;
-    if(plumedswitch){
-      long int lstep=count; plumed_cmd(plumedmain,"setStepLong",&lstep);
-      plumed_cmd(plumedmain,"setPositions",&ems->s.x[0][0]);
-      plumed_cmd(plumedmain,"setMasses",&mdAtoms->mdatoms()->massT[0]);
-      plumed_cmd(plumedmain,"setCharges",&mdAtoms->mdatoms()->chargeA[0]);
-      plumed_cmd(plumedmain,"setBox",&ems->s.box[0][0]);
-      plumed_cmd(plumedmain,"prepareCalc",nullptr);
-      plumed_cmd(plumedmain,"setForces",&ems->f.view().force()[0][0]);
-      plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
-      clear_mat(plumed_vir);
-      plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]);
-    }
-    /* END PLUMED */
-
-    do_force(fplog, cr, ms, inputrec, nullptr, nullptr, imdSession, pull_work, count, nrnb, wcycle,
-             top, ems->s.box, ems->s.x.arrayRefWithPadding(), &ems->s.hist, &ems->f.view(), force_vir,
-             mdAtoms->mdatoms(), enerd, ems->s.lambda, fr, runScheduleWork, vsite, mu_tot, t, nullptr,
-             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY
-                     | (bNS ? GMX_FORCE_NS : 0),
-             DDBalanceRegionHandler(cr));
-
-    /* PLUMED */
-    if(plumedswitch){
-      if(plumedNeedsEnergy) {
-        msmul(force_vir,2.0,plumed_vir);
-        plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
-        plumed_cmd(plumedmain,"performCalc",nullptr);
-        msmul(plumed_vir,0.5,force_vir);
-      } else {
-        msmul(plumed_vir,0.5,plumed_vir);
-        m_add(force_vir,plumed_vir,force_vir);
-      }
-    }
-    /* END PLUMED */
-
-    /* Clear the unused shake virial and pressure */
-    clear_mat(shake_vir);
-    clear_mat(pres);
-
-    /* Communicate stuff when parallel */
-    if (PAR(cr) && inputrec->eI != eiNM)
-    {
-        wallcycle_start(wcycle, ewcMoveE);
-
-        global_stat(gstat, cr, enerd, force_vir, shake_vir, inputrec, nullptr, nullptr, nullptr, 1,
-                    &terminate, nullptr, FALSE, CGLO_ENERGY | CGLO_PRESSURE | CGLO_CONSTRAINT);
-
-        wallcycle_stop(wcycle, ewcMoveE);
-    }
-
-    if (fr->dispersionCorrection)
-    {
-        /* Calculate long range corrections to pressure and energy */
-        const DispersionCorrection::Correction correction =
-                fr->dispersionCorrection->calculate(ems->s.box, ems->s.lambda[efptVDW]);
-
-        enerd->term[F_DISPCORR] = correction.energy;
-        enerd->term[F_EPOT] += correction.energy;
-        enerd->term[F_PRES] += correction.pressure;
-        enerd->term[F_DVDL] += correction.dvdl;
-    }
-    else
-    {
-        enerd->term[F_DISPCORR] = 0;
-    }
-
-    ems->epot = enerd->term[F_EPOT];
-
-    if (constr)
-    {
-        /* Project out the constraint components of the force */
-        bool needsLogging  = false;
-        bool computeEnergy = false;
-        bool computeVirial = true;
-        dvdl_constr        = 0;
-        auto f             = ems->f.view().forceWithPadding();
-        constr->apply(needsLogging, computeEnergy, count, 0, 1.0, ems->s.x.arrayRefWithPadding(), f,
-                      f.unpaddedArrayRef(), ems->s.box, ems->s.lambda[efptBONDED], &dvdl_constr,
-                      gmx::ArrayRefWithPadding<RVec>(), computeVirial, shake_vir,
-                      gmx::ConstraintVariable::ForceDispl);
-        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-        m_add(force_vir, shake_vir, vir);
-    }
-    else
-    {
-        copy_mat(force_vir, vir);
-    }
-
-    clear_mat(ekin);
-    enerd->term[F_PRES] = calc_pres(fr->pbcType, inputrec->nwall, ems->s.box, ekin, vir, pres);
-
-    if (inputrec->efep != efepNO)
-    {
-        accumulateKineticLambdaComponents(enerd, ems->s.lambda, *inputrec->fepvals);
-    }
-
-    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
-    {
-        get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems);
-    }
-}
-
-} // namespace
-
-//! Parallel utility summing energies and forces
-static double reorder_partsum(const t_commrec*  cr,
-                              t_grpopts*        opts,
-                              const gmx_mtop_t* top_global,
-                              const em_state_t* s_min,
-                              const em_state_t* s_b)
-{
-    if (debug)
-    {
-        fprintf(debug, "Doing reorder_partsum\n");
-    }
-
-    auto fm = s_min->f.view().force();
-    auto fb = s_b->f.view().force();
-
-    /* Collect fm in a global vector fmg.
-     * This conflicts with the spirit of domain decomposition,
-     * but to fully optimize this a much more complicated algorithm is required.
-     */
-    const int natoms = top_global->natoms;
-    rvec*     fmg;
-    snew(fmg, natoms);
-
-    gmx::ArrayRef<const int> indicesMin = s_min->s.cg_gl;
-    int                      i          = 0;
-    for (int a : indicesMin)
-    {
-        copy_rvec(fm[i], fmg[a]);
-        i++;
-    }
-    gmx_sum(top_global->natoms * 3, fmg[0], cr);
-
-    /* Now we will determine the part of the sum for the cgs in state s_b */
-    gmx::ArrayRef<const int> indicesB = s_b->s.cg_gl;
-
-    double partsum                        = 0;
-    i                                     = 0;
-    int                                gf = 0;
-    gmx::ArrayRef<const unsigned char> grpnrFREEZE =
-            top_global->groups.groupNumbers[SimulationAtomGroupType::Freeze];
-    for (int a : indicesB)
-    {
-        if (!grpnrFREEZE.empty())
-        {
-            gf = grpnrFREEZE[i];
-        }
-        for (int m = 0; m < DIM; m++)
-        {
-            if (!opts->nFreeze[gf][m])
-            {
-                partsum += (fb[i][m] - fmg[a][m]) * fb[i][m];
-            }
-        }
-        i++;
-    }
-
-    sfree(fmg);
-
-    return partsum;
-}
-
-//! Print some stuff, like beta, whatever that means.
-static real pr_beta(const t_commrec*  cr,
-                    t_grpopts*        opts,
-                    t_mdatoms*        mdatoms,
-                    const gmx_mtop_t* top_global,
-                    const em_state_t* s_min,
-                    const em_state_t* s_b)
-{
-    double sum;
-
-    /* This is just the classical Polak-Ribiere calculation of beta;
-     * it looks a bit complicated since we take freeze groups into account,
-     * and might have to sum it in parallel runs.
-     */
-
-    if (!DOMAINDECOMP(cr)
-        || (s_min->s.ddp_count == cr->dd->ddp_count && s_b->s.ddp_count == cr->dd->ddp_count))
-    {
-        auto fm = s_min->f.view().force();
-        auto fb = s_b->f.view().force();
-        sum     = 0;
-        int gf  = 0;
-        /* This part of code can be incorrect with DD,
-         * since the atom ordering in s_b and s_min might differ.
-         */
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            if (mdatoms->cFREEZE)
-            {
-                gf = mdatoms->cFREEZE[i];
-            }
-            for (int m = 0; m < DIM; m++)
-            {
-                if (!opts->nFreeze[gf][m])
-                {
-                    sum += (fb[i][m] - fm[i][m]) * fb[i][m];
-                }
-            }
-        }
-    }
-    else
-    {
-        /* We need to reorder cgs while summing */
-        sum = reorder_partsum(cr, opts, top_global, s_min, s_b);
-    }
-    if (PAR(cr))
-    {
-        gmx_sumd(1, &sum, cr);
-    }
-
-    return sum / gmx::square(s_min->fnorm);
-}
-
-namespace gmx
-{
-
-void LegacySimulator::do_cg()
-{
-    const char* CG = "Polak-Ribiere Conjugate Gradients";
-
-    gmx_localtop_t    top(top_global->ffparams);
-    gmx_global_stat_t gstat;
-    double            tmp, minstep;
-    real              stepsize;
-    real              a, b, c, beta = 0.0;
-    real              epot_repl = 0;
-    real              pnorm;
-    gmx_bool          converged, foundlower;
-    rvec              mu_tot = { 0 };
-    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
-    tensor            vir, pres;
-    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
-    int               m, step, nminstep;
-    auto              mdatoms = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating conjugate gradient energy minimization via the "
-                    "integrator .mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx minimize and an .mdp option.");
-
-    step = 0;
-
-    if (MASTER(cr))
-    {
-        // In CG, the state is extended with a search direction
-        state_global->flags |= (1 << estCGP);
-
-        // Ensure the extra per-atom state array gets allocated
-        state_change_natoms(state_global, state_global->natoms);
-
-        // Initialize the search direction to zero
-        for (RVec& cg_p : state_global->cg_p)
-        {
-            cg_p = { 0, 0, 0 };
-        }
-    }
-
-    /* Create 4 states on the stack and extract pointers that we will swap */
-    em_state_t  s0{}, s1{}, s2{}, s3{};
-    em_state_t* s_min = &s0;
-    em_state_t* s_a   = &s1;
-    em_state_t* s_b   = &s2;
-    em_state_t* s_c   = &s3;
-
-    /* Init em and store the local state in s_min */
-    init_em(fplog, mdlog, CG, cr, ms /*PLUMED*/, inputrec, imdSession, pull_work, state_global, top_global, s_min,
-            &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, nullptr);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work,
-                                   nullptr, false, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    /* Print to log file */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
-
-    /* Max number of steps */
-    number_steps = inputrec->nsteps;
-
-    if (MASTER(cr))
-    {
-        sp_header(stderr, CG, inputrec->em_tol, number_steps);
-    }
-    if (fplog)
-    {
-        sp_header(fplog, CG, inputrec->em_tol, number_steps);
-    }
-
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-    /* Call the force routine and some auxiliary (neighboursearching etc.) */
-    /* do_force always puts the charge groups in the box and shifts again
-     * We do not unshift, so molecules are always whole in congrad.c
-     */
-    energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE);
-
-    if (MASTER(cr))
-    {
-        /* Copy stuff to the energy bin for easy printing etc. */
-        matrix nullBox = {};
-        energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                         enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                         nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-        EnergyOutput::printHeader(fplog, step, step);
-        energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step,
-                                           step, fr->fcdata.get(), nullptr);
-    }
-
-    /* Estimate/guess the initial stepsize */
-    stepsize = inputrec->em_stepsize / s_min->fnorm;
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", s_min->fmax, s_min->a_fmax + 1);
-        fprintf(stderr, "   F-Norm            = %12.5e\n", s_min->fnorm / sqrtNumAtoms);
-        fprintf(stderr, "\n");
-        /* and copy to the log file too... */
-        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", s_min->fmax, s_min->a_fmax + 1);
-        fprintf(fplog, "   F-Norm            = %12.5e\n", s_min->fnorm / sqrtNumAtoms);
-        fprintf(fplog, "\n");
-    }
-    /* Start the loop over CG steps.
-     * Each successful step is counted, and we continue until
-     * we either converge or reach the max number of steps.
-     */
-    converged = FALSE;
-    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
-    {
-
-        /* start taking steps in a new direction
-         * First time we enter the routine, beta=0, and the direction is
-         * simply the negative gradient.
-         */
-
-        /* Calculate the new direction in p, and the gradient in this direction, gpa */
-        gmx::ArrayRef<gmx::RVec>       pm  = s_min->s.cg_p;
-        gmx::ArrayRef<const gmx::RVec> sfm = s_min->f.view().force();
-        double                         gpa = 0;
-        int                            gf  = 0;
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            if (mdatoms->cFREEZE)
-            {
-                gf = mdatoms->cFREEZE[i];
-            }
-            for (m = 0; m < DIM; m++)
-            {
-                if (!inputrec->opts.nFreeze[gf][m])
-                {
-                    pm[i][m] = sfm[i][m] + beta * pm[i][m];
-                    gpa -= pm[i][m] * sfm[i][m];
-                    /* f is negative gradient, thus the sign */
-                }
-                else
-                {
-                    pm[i][m] = 0;
-                }
-            }
-        }
-
-        /* Sum the gradient along the line across CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &gpa, cr);
-        }
-
-        /* Calculate the norm of the search vector */
-        get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr);
-
-        /* Just in case stepsize reaches zero due to numerical precision... */
-        if (stepsize <= 0)
-        {
-            stepsize = inputrec->em_stepsize / pnorm;
-        }
-
-        /*
-         * Double check the value of the derivative in the search direction.
-         * If it is positive it must be due to the old information in the
-         * CG formula, so just remove that and start over with beta=0.
-         * This corresponds to a steepest descent step.
-         */
-        if (gpa > 0)
-        {
-            beta = 0;
-            step--;   /* Don't count this step since we are restarting */
-            continue; /* Go back to the beginning of the big for-loop */
-        }
-
-        /* Calculate minimum allowed stepsize, before the average (norm)
-         * relative change in coordinate is smaller than precision
-         */
-        minstep      = 0;
-        auto s_min_x = makeArrayRef(s_min->s.x);
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            for (m = 0; m < DIM; m++)
-            {
-                tmp = fabs(s_min_x[i][m]);
-                if (tmp < 1.0)
-                {
-                    tmp = 1.0;
-                }
-                tmp = pm[i][m] / tmp;
-                minstep += tmp * tmp;
-            }
-        }
-        /* Add up from all CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &minstep, cr);
-        }
-
-        minstep = GMX_REAL_EPS / sqrt(minstep / (3 * top_global->natoms));
-
-        if (stepsize < minstep)
-        {
-            converged = TRUE;
-            break;
-        }
-
-        /* Write coordinates if necessary */
-        do_x = do_per_step(step, inputrec->nstxout);
-        do_f = do_per_step(step, inputrec->nstfout);
-
-        write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, top_global, inputrec, step, s_min,
-                      state_global, observablesHistory);
-
-        /* Take a step downhill.
-         * In theory, we should minimize the function along this direction.
-         * That is quite possible, but it turns out to take 5-10 function evaluations
-         * for each line. However, we dont really need to find the exact minimum -
-         * it is much better to start a new CG step in a modified direction as soon
-         * as we are close to it. This will save a lot of energy evaluations.
-         *
-         * In practice, we just try to take a single step.
-         * If it worked (i.e. lowered the energy), we increase the stepsize but
-         * the continue straight to the next CG step without trying to find any minimum.
-         * If it didn't work (higher energy), there must be a minimum somewhere between
-         * the old position and the new one.
-         *
-         * Due to the finite numerical accuracy, it turns out that it is a good idea
-         * to even accept a SMALL increase in energy, if the derivative is still downhill.
-         * This leads to lower final energies in the tests I've done. / Erik
-         */
-        s_a->epot = s_min->epot;
-        a         = 0.0;
-        c         = a + stepsize; /* reference position along line is zero */
-
-        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
-        {
-            em_dd_partition_system(fplog, mdlog, step, cr, top_global, inputrec, imdSession,
-                                   pull_work, s_min, &top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-        }
-
-        /* Take a trial step (new coords in s_c) */
-        do_em_step(cr, inputrec, mdatoms, s_min, c, s_min->s.cg_p.constArrayRefWithPadding(), s_c,
-                   constr, -1);
-
-        neval++;
-        /* Calculate energy for the trial step */
-        energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE);
-
-        /* Calc derivative along line */
-        const rvec*                    pc  = s_c->s.cg_p.rvec_array();
-        gmx::ArrayRef<const gmx::RVec> sfc = s_c->f.view().force();
-        double                         gpc = 0;
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            for (m = 0; m < DIM; m++)
-            {
-                gpc -= pc[i][m] * sfc[i][m]; /* f is negative gradient, thus the sign */
-            }
-        }
-        /* Sum the gradient along the line across CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &gpc, cr);
-        }
-
-        /* This is the max amount of increase in energy we tolerate */
-        tmp = std::sqrt(GMX_REAL_EPS) * fabs(s_a->epot);
-
-        /* Accept the step if the energy is lower, or if it is not significantly higher
-         * and the line derivative is still negative.
-         */
-        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
-        {
-            foundlower = TRUE;
-            /* Great, we found a better energy. Increase step for next iteration
-             * if we are still going down, decrease it otherwise
-             */
-            if (gpc < 0)
-            {
-                stepsize *= 1.618034; /* The golden section */
-            }
-            else
-            {
-                stepsize *= 0.618034; /* 1/golden section */
-            }
-        }
-        else
-        {
-            /* New energy is the same or higher. We will have to do some work
-             * to find a smaller value in the interval. Take smaller step next time!
-             */
-            foundlower = FALSE;
-            stepsize *= 0.618034;
-        }
-
-
-        /* OK, if we didn't find a lower value we will have to locate one now - there must
-         * be one in the interval [a=0,c].
-         * The same thing is valid here, though: Don't spend dozens of iterations to find
-         * the line minimum. We try to interpolate based on the derivative at the endpoints,
-         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
-         *
-         * I also have a safeguard for potentially really pathological functions so we never
-         * take more than 20 steps before we give up ...
-         *
-         * If we already found a lower value we just skip this step and continue to the update.
-         */
-        double gpb;
-        if (!foundlower)
-        {
-            nminstep = 0;
-
-            do
-            {
-                /* Select a new trial point.
-                 * If the derivatives at points a & c have different sign we interpolate to zero,
-                 * otherwise just do a bisection.
-                 */
-                if (gpa < 0 && gpc > 0)
-                {
-                    b = a + gpa * (a - c) / (gpc - gpa);
-                }
-                else
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                /* safeguard if interpolation close to machine accuracy causes errors:
-                 * never go outside the interval
-                 */
-                if (b <= a || b >= c)
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
-                {
-                    /* Reload the old state */
-                    em_dd_partition_system(fplog, mdlog, -1, cr, top_global, inputrec, imdSession, pull_work,
-                                           s_min, &top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-                }
-
-                /* Take a trial step to this new point - new coords in s_b */
-                do_em_step(cr, inputrec, mdatoms, s_min, b,
-                           s_min->s.cg_p.constArrayRefWithPadding(), s_b, constr, -1);
-
-                neval++;
-                /* Calculate energy for the trial step */
-                energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE);
-
-                /* p does not change within a step, but since the domain decomposition
-                 * might change, we have to use cg_p of s_b here.
-                 */
-                const rvec*                    pb  = s_b->s.cg_p.rvec_array();
-                gmx::ArrayRef<const gmx::RVec> sfb = s_b->f.view().force();
-                gpb                                = 0;
-                for (int i = 0; i < mdatoms->homenr; i++)
-                {
-                    for (m = 0; m < DIM; m++)
-                    {
-                        gpb -= pb[i][m] * sfb[i][m]; /* f is negative gradient, thus the sign */
-                    }
-                }
-                /* Sum the gradient along the line across CPUs */
-                if (PAR(cr))
-                {
-                    gmx_sumd(1, &gpb, cr);
-                }
-
-                if (debug)
-                {
-                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", s_a->epot, s_b->epot,
-                            s_c->epot, gpb);
-                }
-
-                epot_repl = s_b->epot;
-
-                /* Keep one of the intervals based on the value of the derivative at the new point */
-                if (gpb > 0)
-                {
-                    /* Replace c endpoint with b */
-                    swap_em_state(&s_b, &s_c);
-                    c   = b;
-                    gpc = gpb;
-                }
-                else
-                {
-                    /* Replace a endpoint with b */
-                    swap_em_state(&s_b, &s_a);
-                    a   = b;
-                    gpa = gpb;
-                }
-
-                /*
-                 * Stop search as soon as we find a value smaller than the endpoints.
-                 * Never run more than 20 steps, no matter what.
-                 */
-                nminstep++;
-            } while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && (nminstep < 20));
-
-            if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot) * GMX_REAL_EPS || nminstep >= 20)
-            {
-                /* OK. We couldn't find a significantly lower energy.
-                 * If beta==0 this was steepest descent, and then we give up.
-                 * If not, set beta=0 and restart with steepest descent before quitting.
-                 */
-                if (beta == 0.0)
-                {
-                    /* Converged */
-                    converged = TRUE;
-                    break;
-                }
-                else
-                {
-                    /* Reset memory before giving up */
-                    beta = 0.0;
-                    continue;
-                }
-            }
-
-            /* Select min energy state of A & C, put the best in B.
-             */
-            if (s_c->epot < s_a->epot)
-            {
-                if (debug)
-                {
-                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", s_c->epot,
-                            s_a->epot);
-                }
-                swap_em_state(&s_b, &s_c);
-                gpb = gpc;
-            }
-            else
-            {
-                if (debug)
-                {
-                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", s_a->epot,
-                            s_c->epot);
-                }
-                swap_em_state(&s_b, &s_a);
-                gpb = gpa;
-            }
-        }
-        else
-        {
-            if (debug)
-            {
-                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", s_c->epot);
-            }
-            swap_em_state(&s_b, &s_c);
-            gpb = gpc;
-        }
-
-        /* new search direction */
-        /* beta = 0 means forget all memory and restart with steepest descents. */
-        if (nstcg && ((step % nstcg) == 0))
-        {
-            beta = 0.0;
-        }
-        else
-        {
-            /* s_min->fnorm cannot be zero, because then we would have converged
-             * and broken out.
-             */
-
-            /* Polak-Ribiere update.
-             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
-             */
-            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
-        }
-        /* Limit beta to prevent oscillations */
-        if (fabs(beta) > 5.0)
-        {
-            beta = 0.0;
-        }
-
-
-        /* update positions */
-        swap_em_state(&s_min, &s_b);
-        gpa = gpb;
-
-        /* Print it if necessary */
-        if (MASTER(cr))
-        {
-            if (mdrunOptions.verbose)
-            {
-                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", step,
-                        s_min->epot, s_min->fnorm / sqrtNumAtoms, s_min->fmax, s_min->a_fmax + 1);
-                fflush(stderr);
-            }
-            /* Store the new (lower) energies */
-            matrix nullBox = {};
-            energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                             enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                             nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-            do_log = do_per_step(step, inputrec->nstlog);
-            do_ene = do_per_step(step, inputrec->nstenergy);
-
-            imdSession->fillEnergyRecord(step, TRUE);
-
-            if (do_log)
-            {
-                EnergyOutput::printHeader(fplog, step, step);
-            }
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
-                                               do_log ? fplog : nullptr, step, step,
-                                               fr->fcdata.get(), nullptr);
-        }
-
-        /* Send energies and positions to the IMD client if bIMD is TRUE. */
-        if (MASTER(cr) && imdSession->run(step, TRUE, state_global->box, state_global->x.rvec_array(), 0))
-        {
-            imdSession->sendPositionsAndEnergies();
-        }
-
-        /* Stop when the maximum force lies below tolerance.
-         * If we have reached machine precision, converged is already set to true.
-         */
-        converged = converged || (s_min->fmax < inputrec->em_tol);
-
-    } /* End of the loop */
-
-    if (converged)
-    {
-        step--; /* we never took that last step in this case */
-    }
-    if (s_min->fmax > inputrec->em_tol)
-    {
-        if (MASTER(cr))
-        {
-            warn_step(fplog, inputrec->em_tol, s_min->fmax, step - 1 == number_steps, FALSE);
-        }
-        converged = FALSE;
-    }
-
-    if (MASTER(cr))
-    {
-        /* If we printed energy and/or logfile last step (which was the last step)
-         * we don't have to do it again, but otherwise print the final values.
-         */
-        if (!do_log)
-        {
-            /* Write final value to log since we didn't do anything the last step */
-            EnergyOutput::printHeader(fplog, step, step);
-        }
-        if (!do_ene || !do_log)
-        {
-            /* Write final energy file entries */
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
-                                               !do_log ? fplog : nullptr, step, step,
-                                               fr->fcdata.get(), nullptr);
-        }
-    }
-
-    /* Print some stuff... */
-    if (MASTER(cr))
-    {
-        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
-    }
-
-    /* IMPORTANT!
-     * For accurate normal mode calculation it is imperative that we
-     * store the last conformation into the full precision binary trajectory.
-     *
-     * However, we should only do it if we did NOT already write this step
-     * above (which we did if do_x or do_f was true).
-     */
-    /* Note that with 0 < nstfout != nstxout we can end up with two frames
-     * in the trajectory with the same step number.
-     */
-    do_x = !do_per_step(step, inputrec->nstxout);
-    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
-
-    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), top_global, inputrec,
-                  step, s_min, state_global, observablesHistory);
-
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, s_min, sqrtNumAtoms);
-        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, s_min, sqrtNumAtoms);
-
-        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    /* To print the actual number of steps we needed somewhere */
-    walltime_accounting_set_nsteps_done(walltime_accounting, step);
-}
-
-
-void LegacySimulator::do_lbfgs()
-{
-    static const char* LBFGS = "Low-Memory BFGS Minimizer";
-    em_state_t         ems;
-    gmx_localtop_t     top(top_global->ffparams);
-    gmx_global_stat_t  gstat;
-    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
-    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
-    real *             rho, *alpha, *p, *s, **dx, **dg;
-    real               a, b, c, maxdelta, delta;
-    real               diag, Epot0;
-    real               dgdx, dgdg, sq, yr, beta;
-    gmx_bool           converged;
-    rvec               mu_tot = { 0 };
-    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
-    tensor             vir, pres;
-    int                start, end, number_steps;
-    int                i, k, m, n, gf, step;
-    int                mdof_flags;
-    auto               mdatoms = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating L-BFGS energy minimization via the "
-                    "integrator .mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx minimize and an .mdp option.");
-
-    if (PAR(cr))
-    {
-        gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank");
-    }
-
-    if (nullptr != constr)
-    {
-        gmx_fatal(
-                FARGS,
-                "The combination of constraints and L-BFGS minimization is not implemented. Either "
-                "do not use constraints, or use another minimizer (e.g. steepest descent).");
-    }
-
-    n        = 3 * state_global->natoms;
-    nmaxcorr = inputrec->nbfgscorr;
-
-    snew(frozen, n);
-
-    snew(p, n);
-    snew(rho, nmaxcorr);
-    snew(alpha, nmaxcorr);
-
-    snew(dx, nmaxcorr);
-    for (i = 0; i < nmaxcorr; i++)
-    {
-        snew(dx[i], n);
-    }
-
-    snew(dg, nmaxcorr);
-    for (i = 0; i < nmaxcorr; i++)
-    {
-        snew(dg[i], n);
-    }
-
-    step  = 0;
-    neval = 0;
-
-    /* Init em */
-    init_em(fplog, mdlog, LBFGS, cr, ms /*PLUMED*/, inputrec, imdSession, pull_work, state_global, top_global,
-            &ems, &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, nullptr);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work,
-                                   nullptr, false, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    start = 0;
-    end   = mdatoms->homenr;
-
-    /* We need 4 working states */
-    em_state_t  s0{}, s1{}, s2{}, s3{};
-    em_state_t* sa   = &s0;
-    em_state_t* sb   = &s1;
-    em_state_t* sc   = &s2;
-    em_state_t* last = &s3;
-    /* Initialize by copying the state from ems (we could skip x and f here) */
-    *sa = ems;
-    *sb = ems;
-    *sc = ems;
-
-    /* Print to log file */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS);
-
-    do_log = do_ene = do_x = do_f = TRUE;
-
-    /* Max number of steps */
-    number_steps = inputrec->nsteps;
-
-    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
-    gf = 0;
-    for (i = start; i < end; i++)
-    {
-        if (mdatoms->cFREEZE)
-        {
-            gf = mdatoms->cFREEZE[i];
-        }
-        for (m = 0; m < DIM; m++)
-        {
-            frozen[3 * i + m] = (inputrec->opts.nFreeze[gf][m] != 0);
-        }
-    }
-    if (MASTER(cr))
-    {
-        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
-    }
-    if (fplog)
-    {
-        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
-    }
-
-    if (vsite)
-    {
-        vsite->construct(state_global->x, 1, {}, state_global->box);
-    }
-
-    /* Call the force routine and some auxiliary (neighboursearching etc.) */
-    /* do_force always puts the charge groups in the box and shifts again
-     * We do not unshift, so molecules are always whole
-     */
-    neval++;
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-    energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE);
-
-    if (MASTER(cr))
-    {
-        /* Copy stuff to the energy bin for easy printing etc. */
-        matrix nullBox = {};
-        energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                         enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                         nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-        EnergyOutput::printHeader(fplog, step, step);
-        energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step,
-                                           step, fr->fcdata.get(), nullptr);
-    }
-
-    /* Set the initial step.
-     * since it will be multiplied by the non-normalized search direction
-     * vector (force vector the first time), we scale it by the
-     * norm of the force.
-     */
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
-        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
-        fprintf(stderr, "   F-Norm            = %12.5e\n", ems.fnorm / sqrtNumAtoms);
-        fprintf(stderr, "\n");
-        /* and copy to the log file too... */
-        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
-        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
-        fprintf(fplog, "   F-Norm            = %12.5e\n", ems.fnorm / sqrtNumAtoms);
-        fprintf(fplog, "\n");
-    }
-
-    // Point is an index to the memory of search directions, where 0 is the first one.
-    point = 0;
-
-    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
-    real* fInit = static_cast<real*>(ems.f.view().force().data()[0]);
-    for (i = 0; i < n; i++)
-    {
-        if (!frozen[i])
-        {
-            dx[point][i] = fInit[i]; /* Initial search direction */
-        }
-        else
-        {
-            dx[point][i] = 0;
-        }
-    }
-
-    // Stepsize will be modified during the search, and actually it is not critical
-    // (the main efficiency in the algorithm comes from changing directions), but
-    // we still need an initial value, so estimate it as the inverse of the norm
-    // so we take small steps where the potential fluctuates a lot.
-    stepsize = 1.0 / ems.fnorm;
-
-    /* Start the loop over BFGS steps.
-     * Each successful step is counted, and we continue until
-     * we either converge or reach the max number of steps.
-     */
-
-    ncorr = 0;
-
-    /* Set the gradient from the force */
-    converged = FALSE;
-    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
-    {
-
-        /* Write coordinates if necessary */
-        do_x = do_per_step(step, inputrec->nstxout);
-        do_f = do_per_step(step, inputrec->nstfout);
-
-        mdof_flags = 0;
-        if (do_x)
-        {
-            mdof_flags |= MDOF_X;
-        }
-
-        if (do_f)
-        {
-            mdof_flags |= MDOF_F;
-        }
-
-        if (inputrec->bIMD)
-        {
-            mdof_flags |= MDOF_IMD;
-        }
-
-        gmx::WriteCheckpointDataHolder checkpointDataHolder;
-        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, top_global->natoms, step,
-                                         static_cast<real>(step), &ems.s, state_global, observablesHistory,
-                                         ems.f.view().force(), &checkpointDataHolder);
-
-        /* Do the linesearching in the direction dx[point][0..(n-1)] */
-
-        /* make s a pointer to current search direction - point=0 first time we get here */
-        s = dx[point];
-
-        real* xx = static_cast<real*>(ems.s.x.rvec_array()[0]);
-        real* ff = static_cast<real*>(ems.f.view().force().data()[0]);
-
-        // calculate line gradient in position A
-        for (gpa = 0, i = 0; i < n; i++)
-        {
-            gpa -= s[i] * ff[i];
-        }
-
-        /* Calculate minimum allowed stepsize along the line, before the average (norm)
-         * relative change in coordinate is smaller than precision
-         */
-        for (minstep = 0, i = 0; i < n; i++)
-        {
-            tmp = fabs(xx[i]);
-            if (tmp < 1.0)
-            {
-                tmp = 1.0;
-            }
-            tmp = s[i] / tmp;
-            minstep += tmp * tmp;
-        }
-        minstep = GMX_REAL_EPS / sqrt(minstep / n);
-
-        if (stepsize < minstep)
-        {
-            converged = TRUE;
-            break;
-        }
-
-        // Before taking any steps along the line, store the old position
-        *last       = ems;
-        real* lastx = static_cast<real*>(last->s.x.data()[0]);
-        real* lastf = static_cast<real*>(last->f.view().force().data()[0]);
-        Epot0       = ems.epot;
-
-        *sa = ems;
-
-        /* Take a step downhill.
-         * In theory, we should find the actual minimum of the function in this
-         * direction, somewhere along the line.
-         * That is quite possible, but it turns out to take 5-10 function evaluations
-         * for each line. However, we dont really need to find the exact minimum -
-         * it is much better to start a new BFGS step in a modified direction as soon
-         * as we are close to it. This will save a lot of energy evaluations.
-         *
-         * In practice, we just try to take a single step.
-         * If it worked (i.e. lowered the energy), we increase the stepsize but
-         * continue straight to the next BFGS step without trying to find any minimum,
-         * i.e. we change the search direction too. If the line was smooth, it is
-         * likely we are in a smooth region, and then it makes sense to take longer
-         * steps in the modified search direction too.
-         *
-         * If it didn't work (higher energy), there must be a minimum somewhere between
-         * the old position and the new one. Then we need to start by finding a lower
-         * value before we change search direction. Since the energy was apparently
-         * quite rough, we need to decrease the step size.
-         *
-         * Due to the finite numerical accuracy, it turns out that it is a good idea
-         * to accept a SMALL increase in energy, if the derivative is still downhill.
-         * This leads to lower final energies in the tests I've done. / Erik
-         */
-
-        // State "A" is the first position along the line.
-        // reference position along line is initially zero
-        a = 0.0;
-
-        // Check stepsize first. We do not allow displacements
-        // larger than emstep.
-        //
-        do
-        {
-            // Pick a new position C by adding stepsize to A.
-            c = a + stepsize;
-
-            // Calculate what the largest change in any individual coordinate
-            // would be (translation along line * gradient along line)
-            maxdelta = 0;
-            for (i = 0; i < n; i++)
-            {
-                delta = c * s[i];
-                if (delta > maxdelta)
-                {
-                    maxdelta = delta;
-                }
-            }
-            // If any displacement is larger than the stepsize limit, reduce the step
-            if (maxdelta > inputrec->em_stepsize)
-            {
-                stepsize *= 0.1;
-            }
-        } while (maxdelta > inputrec->em_stepsize);
-
-        // Take a trial step and move the coordinate array xc[] to position C
-        real* xc = static_cast<real*>(sc->s.x.rvec_array()[0]);
-        for (i = 0; i < n; i++)
-        {
-            xc[i] = lastx[i] + c * s[i];
-        }
-
-        neval++;
-        // Calculate energy for the trial step in position C
-        energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE);
-
-        // Calc line gradient in position C
-        real* fc = static_cast<real*>(sc->f.view().force()[0]);
-        for (gpc = 0, i = 0; i < n; i++)
-        {
-            gpc -= s[i] * fc[i]; /* f is negative gradient, thus the sign */
-        }
-        /* Sum the gradient along the line across CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &gpc, cr);
-        }
-
-        // This is the max amount of increase in energy we tolerate.
-        // By allowing VERY small changes (close to numerical precision) we
-        // frequently find even better (lower) final energies.
-        tmp = std::sqrt(GMX_REAL_EPS) * fabs(sa->epot);
-
-        // Accept the step if the energy is lower in the new position C (compared to A),
-        // or if it is not significantly higher and the line derivative is still negative.
-        foundlower = sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp));
-        // If true, great, we found a better energy. We no longer try to alter the
-        // stepsize, but simply accept this new better position. The we select a new
-        // search direction instead, which will be much more efficient than continuing
-        // to take smaller steps along a line. Set fnorm based on the new C position,
-        // which will be used to update the stepsize to 1/fnorm further down.
-
-        // If false, the energy is NOT lower in point C, i.e. it will be the same
-        // or higher than in point A. In this case it is pointless to move to point C,
-        // so we will have to do more iterations along the same line to find a smaller
-        // value in the interval [A=0.0,C].
-        // Here, A is still 0.0, but that will change when we do a search in the interval
-        // [0.0,C] below. That search we will do by interpolation or bisection rather
-        // than with the stepsize, so no need to modify it. For the next search direction
-        // it will be reset to 1/fnorm anyway.
-
-        if (!foundlower)
-        {
-            // OK, if we didn't find a lower value we will have to locate one now - there must
-            // be one in the interval [a,c].
-            // The same thing is valid here, though: Don't spend dozens of iterations to find
-            // the line minimum. We try to interpolate based on the derivative at the endpoints,
-            // and only continue until we find a lower value. In most cases this means 1-2 iterations.
-            // I also have a safeguard for potentially really pathological functions so we never
-            // take more than 20 steps before we give up.
-            // If we already found a lower value we just skip this step and continue to the update.
-            real fnorm = 0;
-            nminstep   = 0;
-            do
-            {
-                // Select a new trial point B in the interval [A,C].
-                // If the derivatives at points a & c have different sign we interpolate to zero,
-                // otherwise just do a bisection since there might be multiple minima/maxima
-                // inside the interval.
-                if (gpa < 0 && gpc > 0)
-                {
-                    b = a + gpa * (a - c) / (gpc - gpa);
-                }
-                else
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                /* safeguard if interpolation close to machine accuracy causes errors:
-                 * never go outside the interval
-                 */
-                if (b <= a || b >= c)
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                // Take a trial step to point B
-                real* xb = static_cast<real*>(sb->s.x.rvec_array()[0]);
-                for (i = 0; i < n; i++)
-                {
-                    xb[i] = lastx[i] + b * s[i];
-                }
-
-                neval++;
-                // Calculate energy for the trial step in point B
-                energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE);
-                fnorm = sb->fnorm;
-
-                // Calculate gradient in point B
-                real* fb = static_cast<real*>(sb->f.view().force()[0]);
-                for (gpb = 0, i = 0; i < n; i++)
-                {
-                    gpb -= s[i] * fb[i]; /* f is negative gradient, thus the sign */
-                }
-                /* Sum the gradient along the line across CPUs */
-                if (PAR(cr))
-                {
-                    gmx_sumd(1, &gpb, cr);
-                }
-
-                // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative
-                // at the new point B, and rename the endpoints of this new interval A and C.
-                if (gpb > 0)
-                {
-                    /* Replace c endpoint with b */
-                    c = b;
-                    /* copy state b to c */
-                    *sc = *sb;
-                }
-                else
-                {
-                    /* Replace a endpoint with b */
-                    a = b;
-                    /* copy state b to a */
-                    *sa = *sb;
-                }
-
-                /*
-                 * Stop search as soon as we find a value smaller than the endpoints,
-                 * or if the tolerance is below machine precision.
-                 * Never run more than 20 steps, no matter what.
-                 */
-                nminstep++;
-            } while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20));
-
-            if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20)
-            {
-                /* OK. We couldn't find a significantly lower energy.
-                 * If ncorr==0 this was steepest descent, and then we give up.
-                 * If not, reset memory to restart as steepest descent before quitting.
-                 */
-                if (ncorr == 0)
-                {
-                    /* Converged */
-                    converged = TRUE;
-                    break;
-                }
-                else
-                {
-                    /* Reset memory */
-                    ncorr = 0;
-                    /* Search in gradient direction */
-                    for (i = 0; i < n; i++)
-                    {
-                        dx[point][i] = ff[i];
-                    }
-                    /* Reset stepsize */
-                    stepsize = 1.0 / fnorm;
-                    continue;
-                }
-            }
-
-            /* Select min energy state of A & C, put the best in xx/ff/Epot
-             */
-            if (sc->epot < sa->epot)
-            {
-                /* Use state C */
-                ems        = *sc;
-                step_taken = c;
-            }
-            else
-            {
-                /* Use state A */
-                ems        = *sa;
-                step_taken = a;
-            }
-        }
-        else
-        {
-            /* found lower */
-            /* Use state C */
-            ems        = *sc;
-            step_taken = c;
-        }
-
-        /* Update the memory information, and calculate a new
-         * approximation of the inverse hessian
-         */
-
-        /* Have new data in Epot, xx, ff */
-        if (ncorr < nmaxcorr)
-        {
-            ncorr++;
-        }
-
-        for (i = 0; i < n; i++)
-        {
-            dg[point][i] = lastf[i] - ff[i];
-            dx[point][i] *= step_taken;
-        }
-
-        dgdg = 0;
-        dgdx = 0;
-        for (i = 0; i < n; i++)
-        {
-            dgdg += dg[point][i] * dg[point][i];
-            dgdx += dg[point][i] * dx[point][i];
-        }
-
-        diag = dgdx / dgdg;
-
-        rho[point] = 1.0 / dgdx;
-        point++;
-
-        if (point >= nmaxcorr)
-        {
-            point = 0;
-        }
-
-        /* Update */
-        for (i = 0; i < n; i++)
-        {
-            p[i] = ff[i];
-        }
-
-        cp = point;
-
-        /* Recursive update. First go back over the memory points */
-        for (k = 0; k < ncorr; k++)
-        {
-            cp--;
-            if (cp < 0)
-            {
-                cp = ncorr - 1;
-            }
-
-            sq = 0;
-            for (i = 0; i < n; i++)
-            {
-                sq += dx[cp][i] * p[i];
-            }
-
-            alpha[cp] = rho[cp] * sq;
-
-            for (i = 0; i < n; i++)
-            {
-                p[i] -= alpha[cp] * dg[cp][i];
-            }
-        }
-
-        for (i = 0; i < n; i++)
-        {
-            p[i] *= diag;
-        }
-
-        /* And then go forward again */
-        for (k = 0; k < ncorr; k++)
-        {
-            yr = 0;
-            for (i = 0; i < n; i++)
-            {
-                yr += p[i] * dg[cp][i];
-            }
-
-            beta = rho[cp] * yr;
-            beta = alpha[cp] - beta;
-
-            for (i = 0; i < n; i++)
-            {
-                p[i] += beta * dx[cp][i];
-            }
-
-            cp++;
-            if (cp >= ncorr)
-            {
-                cp = 0;
-            }
-        }
-
-        for (i = 0; i < n; i++)
-        {
-            if (!frozen[i])
-            {
-                dx[point][i] = p[i];
-            }
-            else
-            {
-                dx[point][i] = 0;
-            }
-        }
-
-        /* Print it if necessary */
-        if (MASTER(cr))
-        {
-            if (mdrunOptions.verbose)
-            {
-                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", step,
-                        ems.epot, ems.fnorm / sqrtNumAtoms, ems.fmax, ems.a_fmax + 1);
-                fflush(stderr);
-            }
-            /* Store the new (lower) energies */
-            matrix nullBox = {};
-            energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                             enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                             nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-            do_log = do_per_step(step, inputrec->nstlog);
-            do_ene = do_per_step(step, inputrec->nstenergy);
-
-            imdSession->fillEnergyRecord(step, TRUE);
-
-            if (do_log)
-            {
-                EnergyOutput::printHeader(fplog, step, step);
-            }
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
-                                               do_log ? fplog : nullptr, step, step,
-                                               fr->fcdata.get(), nullptr);
-        }
-
-        /* Send x and E to IMD client, if bIMD is TRUE. */
-        if (imdSession->run(step, TRUE, state_global->box, state_global->x.rvec_array(), 0) && MASTER(cr))
-        {
-            imdSession->sendPositionsAndEnergies();
-        }
-
-        // Reset stepsize in we are doing more iterations
-        stepsize = 1.0;
-
-        /* Stop when the maximum force lies below tolerance.
-         * If we have reached machine precision, converged is already set to true.
-         */
-        converged = converged || (ems.fmax < inputrec->em_tol);
-
-    } /* End of the loop */
-
-    if (converged)
-    {
-        step--; /* we never took that last step in this case */
-    }
-    if (ems.fmax > inputrec->em_tol)
-    {
-        if (MASTER(cr))
-        {
-            warn_step(fplog, inputrec->em_tol, ems.fmax, step - 1 == number_steps, FALSE);
-        }
-        converged = FALSE;
-    }
-
-    /* If we printed energy and/or logfile last step (which was the last step)
-     * we don't have to do it again, but otherwise print the final values.
-     */
-    if (!do_log) /* Write final value to log since we didn't do anythin last step */
-    {
-        EnergyOutput::printHeader(fplog, step, step);
-    }
-    if (!do_ene || !do_log) /* Write final energy file entries */
-    {
-        energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
-                                           !do_log ? fplog : nullptr, step, step, fr->fcdata.get(),
-                                           nullptr);
-    }
-
-    /* Print some stuff... */
-    if (MASTER(cr))
-    {
-        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
-    }
-
-    /* IMPORTANT!
-     * For accurate normal mode calculation it is imperative that we
-     * store the last conformation into the full precision binary trajectory.
-     *
-     * However, we should only do it if we did NOT already write this step
-     * above (which we did if do_x or do_f was true).
-     */
-    do_x = !do_per_step(step, inputrec->nstxout);
-    do_f = !do_per_step(step, inputrec->nstfout);
-    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), top_global, inputrec,
-                  step, &ems, state_global, observablesHistory);
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, number_steps, &ems, sqrtNumAtoms);
-        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, number_steps, &ems, sqrtNumAtoms);
-
-        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    /* To print the actual number of steps we needed somewhere */
-    walltime_accounting_set_nsteps_done(walltime_accounting, step);
-}
-
-void LegacySimulator::do_steep()
-{
-    const char*       SD = "Steepest Descents";
-    gmx_localtop_t    top(top_global->ffparams);
-    gmx_global_stat_t gstat;
-    real              stepsize;
-    real              ustep;
-    gmx_bool          bDone, bAbort, do_x, do_f;
-    tensor            vir, pres;
-    rvec              mu_tot = { 0 };
-    int               nsteps;
-    int               count          = 0;
-    int               steps_accepted = 0;
-    auto              mdatoms        = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating steepest-descent energy minimization via the "
-                    "integrator .mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx minimize and an .mdp option.");
-
-    /* Create 2 states on the stack and extract pointers that we will swap */
-    em_state_t  s0{}, s1{};
-    em_state_t* s_min = &s0;
-    em_state_t* s_try = &s1;
-
-    /* Init em and store the local state in s_try */
-    init_em(fplog, mdlog, SD, cr, ms /*PLUMED*/, inputrec, imdSession, pull_work, state_global, top_global, s_try,
-            &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, nullptr);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work,
-                                   nullptr, false, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    /* Print to log file  */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
-
-    /* Set variables for stepsize (in nm). This is the largest
-     * step that we are going to make in any direction.
-     */
-    ustep    = inputrec->em_stepsize;
-    stepsize = 0;
-
-    /* Max number of steps  */
-    nsteps = inputrec->nsteps;
-
-    if (MASTER(cr))
-    {
-        /* Print to the screen  */
-        sp_header(stderr, SD, inputrec->em_tol, nsteps);
-    }
-    if (fplog)
-    {
-        sp_header(fplog, SD, inputrec->em_tol, nsteps);
-    }
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-
-    /**** HERE STARTS THE LOOP ****
-     * count is the counter for the number of steps
-     * bDone will be TRUE when the minimization has converged
-     * bAbort will be TRUE when nsteps steps have been performed or when
-     * the stepsize becomes smaller than is reasonable for machine precision
-     */
-    count  = 0;
-    bDone  = FALSE;
-    bAbort = FALSE;
-    while (!bDone && !bAbort)
-    {
-        bAbort = (nsteps >= 0) && (count == nsteps);
-
-        /* set new coordinates, except for first step */
-        bool validStep = true;
-        if (count > 0)
-        {
-            validStep = do_em_step(cr, inputrec, mdatoms, s_min, stepsize,
-                                   s_min->f.view().forceWithPadding(), s_try, constr, count);
-        }
-
-        if (validStep)
-        {
-            energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0);
-        }
-        else
-        {
-            // Signal constraint error during stepping with energy=inf
-            s_try->epot = std::numeric_limits<real>::infinity();
-        }
-
-        if (MASTER(cr))
-        {
-            EnergyOutput::printHeader(fplog, count, count);
-        }
-
-        if (count == 0)
-        {
-            s_min->epot = s_try->epot;
-        }
-
-        /* Print it if necessary  */
-        if (MASTER(cr))
-        {
-            if (mdrunOptions.verbose)
-            {
-                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
-                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax + 1,
-                        ((count == 0) || (s_try->epot < s_min->epot)) ? '\n' : '\r');
-                fflush(stderr);
-            }
-
-            if ((count == 0) || (s_try->epot < s_min->epot))
-            {
-                /* Store the new (lower) energies  */
-                matrix nullBox = {};
-                energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(count),
-                                                 mdatoms->tmass, enerd, nullptr, nullptr, nullBox,
-                                                 PTCouplingArrays(), 0, nullptr, nullptr, vir, pres,
-                                                 nullptr, mu_tot, constr);
-
-                imdSession->fillEnergyRecord(count, TRUE);
-
-                const bool do_dr = do_per_step(steps_accepted, inputrec->nstdisreout);
-                const bool do_or = do_per_step(steps_accepted, inputrec->nstorireout);
-                energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), TRUE, do_dr, do_or,
-                                                   fplog, count, count, fr->fcdata.get(), nullptr);
-                fflush(fplog);
-            }
-        }
-
-        /* Now if the new energy is smaller than the previous...
-         * or if this is the first step!
-         * or if we did random steps!
-         */
-
-        if ((count == 0) || (s_try->epot < s_min->epot))
-        {
-            steps_accepted++;
-
-            /* Test whether the convergence criterion is met...  */
-            bDone = (s_try->fmax < inputrec->em_tol);
-
-            /* Copy the arrays for force, positions and energy  */
-            /* The 'Min' array always holds the coords and forces of the minimal
-               sampled energy  */
-            swap_em_state(&s_min, &s_try);
-            if (count > 0)
-            {
-                ustep *= 1.2;
-            }
-
-            /* Write to trn, if necessary */
-            do_x = do_per_step(steps_accepted, inputrec->nstxout);
-            do_f = do_per_step(steps_accepted, inputrec->nstfout);
-            write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, top_global, inputrec, count, s_min,
-                          state_global, observablesHistory);
-        }
-        else
-        {
-            /* If energy is not smaller make the step smaller...  */
-            ustep *= 0.5;
-
-            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
-            {
-                /* Reload the old state */
-                em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, imdSession,
-                                       pull_work, s_min, &top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-            }
-        }
-
-        // If the force is very small after finishing minimization,
-        // we risk dividing by zero when calculating the step size.
-        // So we check first if the minimization has stopped before
-        // trying to obtain a new step size.
-        if (!bDone)
-        {
-            /* Determine new step  */
-            stepsize = ustep / s_min->fmax;
-        }
-
-        /* Check if stepsize is too small, with 1 nm as a characteristic length */
-#if GMX_DOUBLE
-        if (count == nsteps || ustep < 1e-12)
-#else
-        if (count == nsteps || ustep < 1e-6)
-#endif
-        {
-            if (MASTER(cr))
-            {
-                warn_step(fplog, inputrec->em_tol, s_min->fmax, count == nsteps, constr != nullptr);
-            }
-            bAbort = TRUE;
-        }
-
-        /* Send IMD energies and positions, if bIMD is TRUE. */
-        if (imdSession->run(count, TRUE, MASTER(cr) ? state_global->box : nullptr,
-                            MASTER(cr) ? state_global->x.rvec_array() : nullptr, 0)
-            && MASTER(cr))
-        {
-            imdSession->sendPositionsAndEnergies();
-        }
-
-        count++;
-    } /* End of the loop  */
-
-    /* Print some data...  */
-    if (MASTER(cr))
-    {
-        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
-    }
-    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout != 0, ftp2fn(efSTO, nfile, fnm),
-                  top_global, inputrec, count, s_min, state_global, observablesHistory);
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-
-        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, s_min, sqrtNumAtoms);
-        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, s_min, sqrtNumAtoms);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    /* To print the actual number of steps we needed somewhere */
-    inputrec->nsteps = count;
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, count);
-}
-
-void LegacySimulator::do_nm()
-{
-    const char*         NM = "Normal Mode Analysis";
-    int                 nnodes;
-    gmx_localtop_t      top(top_global->ffparams);
-    gmx_global_stat_t   gstat;
-    tensor              vir, pres;
-    rvec                mu_tot = { 0 };
-    rvec*               dfdx;
-    gmx_bool            bSparse; /* use sparse matrix storage format */
-    size_t              sz;
-    gmx_sparsematrix_t* sparse_matrix = nullptr;
-    real*               full_matrix   = nullptr;
-
-    /* added with respect to mdrun */
-    int  row, col;
-    real der_range = 10.0 * std::sqrt(GMX_REAL_EPS);
-    real x_min;
-    bool bIsMaster = MASTER(cr);
-    auto mdatoms   = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating normal-mode analysis via the integrator "
-                    ".mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx normal-modes.");
-
-    if (constr != nullptr)
-    {
-        gmx_fatal(
-                FARGS,
-                "Constraints present with Normal Mode Analysis, this combination is not supported");
-    }
-
-    gmx_shellfc_t* shellfc;
-
-    em_state_t state_work{};
-
-    /* Init em and store the local state in state_minimum */
-    init_em(fplog, mdlog, NM, cr, ms /*PLUMED*/, inputrec, imdSession, pull_work, state_global, top_global,
-            &state_work, &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, &shellfc);
-    const bool  simulationsShareState = false;
-    gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-
-    std::vector<int>       atom_index = get_atom_index(top_global);
-    std::vector<gmx::RVec> fneg(atom_index.size(), { 0, 0, 0 });
-    snew(dfdx, atom_index.size());
-
-#if !GMX_DOUBLE
-    if (bIsMaster)
-    {
-        fprintf(stderr,
-                "NOTE: This version of GROMACS has been compiled in single precision,\n"
-                "      which MIGHT not be accurate enough for normal mode analysis.\n"
-                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
-                "      are fairly modest even if you recompile in double precision.\n\n");
-    }
-#endif
-
-    /* Check if we can/should use sparse storage format.
-     *
-     * Sparse format is only useful when the Hessian itself is sparse, which it
-     * will be when we use a cutoff.
-     * For small systems (n<1000) it is easier to always use full matrix format, though.
-     */
-    if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0)
-    {
-        GMX_LOG(mdlog.warning)
-                .appendText("Non-cutoff electrostatics used, forcing full Hessian format.");
-        bSparse = FALSE;
-    }
-    else if (atom_index.size() < 1000)
-    {
-        GMX_LOG(mdlog.warning)
-                .appendTextFormatted("Small system size (N=%zu), using full Hessian format.",
-                                     atom_index.size());
-        bSparse = FALSE;
-    }
-    else
-    {
-        GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format.");
-        bSparse = TRUE;
-    }
-
-    /* Number of dimensions, based on real atoms, that is not vsites or shell */
-    sz = DIM * atom_index.size();
-
-    fprintf(stderr, "Allocating Hessian memory...\n\n");
-
-    if (bSparse)
-    {
-        sparse_matrix                       = gmx_sparsematrix_init(sz);
-        sparse_matrix->compressed_symmetric = TRUE;
-    }
-    else
-    {
-        snew(full_matrix, sz * sz);
-    }
-
-    /* Write start time and temperature */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
-
-    /* fudge nr of steps to nr of atoms */
-    inputrec->nsteps = atom_index.size() * 2;
-
-    if (bIsMaster)
-    {
-        fprintf(stderr, "starting normal mode calculation '%s'\n%" PRId64 " steps.\n\n",
-                *(top_global->name), inputrec->nsteps);
-    }
-
-    nnodes = cr->nnodes;
-
-    /* Make evaluate_energy do a single node force calculation */
-    cr->nnodes = 1;
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-    energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE);
-    cr->nnodes = nnodes;
-
-    /* if forces are not small, warn user */
-    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work);
-
-    GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax);
-    if (state_work.fmax > 1.0e-3)
-    {
-        GMX_LOG(mdlog.warning)
-                .appendText(
-                        "The force is probably not small enough to "
-                        "ensure that you are at a minimum.\n"
-                        "Be aware that negative eigenvalues may occur\n"
-                        "when the resulting matrix is diagonalized.");
-    }
-
-    /***********************************************************
-     *
-     *      Loop over all pairs in matrix
-     *
-     *      do_force called twice. Once with positive and
-     *      once with negative displacement
-     *
-     ************************************************************/
-
-    /* Steps are divided one by one over the nodes */
-    bool bNS          = true;
-    auto state_work_x = makeArrayRef(state_work.s.x);
-    auto state_work_f = state_work.f.view().force();
-    for (index aid = cr->nodeid; aid < ssize(atom_index); aid += nnodes)
-    {
-        size_t atom = atom_index[aid];
-        for (size_t d = 0; d < DIM; d++)
-        {
-            int64_t step        = 0;
-            int     force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES;
-            double  t           = 0;
-
-            x_min = state_work_x[atom][d];
-
-            for (unsigned int dx = 0; (dx < 2); dx++)
-            {
-                if (dx == 0)
-                {
-                    state_work_x[atom][d] = x_min - der_range;
-                }
-                else
-                {
-                    state_work_x[atom][d] = x_min + der_range;
-                }
-
-                /* Make evaluate_energy do a single node force calculation */
-                cr->nnodes = 1;
-                if (shellfc)
-                {
-                    /* Now is the time to relax the shells */
-                    relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, nullptr, step, inputrec,
-                                        imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
-                                        state_work.s.natoms, state_work.s.x.arrayRefWithPadding(),
-                                        state_work.s.v.arrayRefWithPadding(), state_work.s.box,
-                                        state_work.s.lambda, &state_work.s.hist, &state_work.f.view(),
-                                        vir, mdatoms, nrnb, wcycle, shellfc, fr, runScheduleWork, t,
-                                        mu_tot, vsite, DDBalanceRegionHandler(nullptr));
-                    bNS = false;
-                    step++;
-                }
-                else
-                {
-                    energyEvaluator.run(&state_work, mu_tot, vir, pres, aid * 2 + dx, FALSE);
-                }
-
-                cr->nnodes = nnodes;
-
-                if (dx == 0)
-                {
-                    std::copy(state_work_f.begin(), state_work_f.begin() + atom_index.size(),
-                              fneg.begin());
-                }
-            }
-
-            /* x is restored to original */
-            state_work_x[atom][d] = x_min;
-
-            for (size_t j = 0; j < atom_index.size(); j++)
-            {
-                for (size_t k = 0; (k < DIM); k++)
-                {
-                    dfdx[j][k] = -(state_work_f[atom_index[j]][k] - fneg[j][k]) / (2 * der_range);
-                }
-            }
-
-            if (!bIsMaster)
-            {
-#if GMX_MPI
-#    define mpi_type GMX_MPI_REAL
-                MPI_Send(dfdx[0], atom_index.size() * DIM, mpi_type, MASTER(cr), cr->nodeid,
-                         cr->mpi_comm_mygroup);
-#endif
-            }
-            else
-            {
-                for (index node = 0; (node < nnodes && aid + node < ssize(atom_index)); node++)
-                {
-                    if (node > 0)
-                    {
-#if GMX_MPI
-                        MPI_Status stat;
-                        MPI_Recv(dfdx[0], atom_index.size() * DIM, mpi_type, node, node,
-                                 cr->mpi_comm_mygroup, &stat);
-#    undef mpi_type
-#endif
-                    }
-
-                    row = (aid + node) * DIM + d;
-
-                    for (size_t j = 0; j < atom_index.size(); j++)
-                    {
-                        for (size_t k = 0; k < DIM; k++)
-                        {
-                            col = j * DIM + k;
-
-                            if (bSparse)
-                            {
-                                if (col >= row && dfdx[j][k] != 0.0)
-                                {
-                                    gmx_sparsematrix_increment_value(sparse_matrix, row, col, dfdx[j][k]);
-                                }
-                            }
-                            else
-                            {
-                                full_matrix[row * sz + col] = dfdx[j][k];
-                            }
-                        }
-                    }
-                }
-            }
-
-            if (mdrunOptions.verbose && fplog)
-            {
-                fflush(fplog);
-            }
-        }
-        /* write progress */
-        if (bIsMaster && mdrunOptions.verbose)
-        {
-            fprintf(stderr, "\rFinished step %d out of %td",
-                    std::min<int>(atom + nnodes, atom_index.size()), ssize(atom_index));
-            fflush(stderr);
-        }
-    }
-
-    if (bIsMaster)
-    {
-        fprintf(stderr, "\n\nWriting Hessian...\n");
-        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size() * 2);
-}
-
-} // namespace gmx
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/minimize.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/minimize.cpp.preplumed
deleted file mode 100644
index b6b9376e3a..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/minimize.cpp.preplumed
+++ /dev/null
@@ -1,2880 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief This file defines integrators for energy minimization
- *
- * \author Berk Hess <hess@kth.se>
- * \author Erik Lindahl <erik@kth.se>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "config.h"
-
-#include <cmath>
-#include <cstring>
-#include <ctime>
-
-#include <algorithm>
-#include <limits>
-#include <vector>
-
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/collect.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/mdsetup.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/fileio/confio.h"
-#include "gromacs/fileio/mtxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/linearalgebra/sparsematrix.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/coupling.h"
-#include "gromacs/mdlib/dispersioncorrection.h"
-#include "gromacs/mdlib/ebin.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/energyoutput.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/stat.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/trajectory_writing.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdtypes/checkpointdata.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/exceptions.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/smalloc.h"
-
-#include "legacysimulator.h"
-#include "shellfc.h"
-
-using gmx::ArrayRef;
-using gmx::MdrunScheduleWorkload;
-using gmx::RVec;
-using gmx::VirtualSitesHandler;
-
-//! Utility structure for manipulating states during EM
-typedef struct em_state
-{
-    //! Copy of the global state
-    t_state s;
-    //! Force array
-    gmx::ForceBuffers f;
-    //! Potential energy
-    real epot;
-    //! Norm of the force
-    real fnorm;
-    //! Maximum force
-    real fmax;
-    //! Direction
-    int a_fmax;
-} em_state_t;
-
-//! Print the EM starting conditions
-static void print_em_start(FILE*                     fplog,
-                           const t_commrec*          cr,
-                           gmx_walltime_accounting_t walltime_accounting,
-                           gmx_wallcycle_t           wcycle,
-                           const char*               name)
-{
-    walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
-    print_start(fplog, cr, walltime_accounting, name);
-}
-
-//! Stop counting time for EM
-static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle_t wcycle)
-{
-    wallcycle_stop(wcycle, ewcRUN);
-
-    walltime_accounting_end_time(walltime_accounting);
-}
-
-//! Printing a log file and console header
-static void sp_header(FILE* out, const char* minimizer, real ftol, int nsteps)
-{
-    fprintf(out, "\n");
-    fprintf(out, "%s:\n", minimizer);
-    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
-    fprintf(out, "   Number of steps    = %12d\n", nsteps);
-}
-
-//! Print warning message
-static void warn_step(FILE* fp, real ftol, real fmax, gmx_bool bLastStep, gmx_bool bConstrain)
-{
-    constexpr bool realIsDouble = GMX_DOUBLE;
-    char           buffer[2048];
-
-    if (!std::isfinite(fmax))
-    {
-        sprintf(buffer,
-                "\nEnergy minimization has stopped because the force "
-                "on at least one atom is not finite. This usually means "
-                "atoms are overlapping. Modify the input coordinates to "
-                "remove atom overlap or use soft-core potentials with "
-                "the free energy code to avoid infinite forces.\n%s",
-                !realIsDouble ? "You could also be lucky that switching to double precision "
-                                "is sufficient to obtain finite forces.\n"
-                              : "");
-    }
-    else if (bLastStep)
-    {
-        sprintf(buffer,
-                "\nEnergy minimization reached the maximum number "
-                "of steps before the forces reached the requested "
-                "precision Fmax < %g.\n",
-                ftol);
-    }
-    else
-    {
-        sprintf(buffer,
-                "\nEnergy minimization has stopped, but the forces have "
-                "not converged to the requested precision Fmax < %g (which "
-                "may not be possible for your system). It stopped "
-                "because the algorithm tried to make a new step whose size "
-                "was too small, or there was no change in the energy since "
-                "last step. Either way, we regard the minimization as "
-                "converged to within the available machine precision, "
-                "given your starting configuration and EM parameters.\n%s%s",
-                ftol,
-                !realIsDouble ? "\nDouble precision normally gives you higher accuracy, but "
-                                "this is often not needed for preparing to run molecular "
-                                "dynamics.\n"
-                              : "",
-                bConstrain ? "You might need to increase your constraint accuracy, or turn\n"
-                             "off constraints altogether (set constraints = none in mdp file)\n"
-                           : "");
-    }
-
-    fputs(wrap_lines(buffer, 78, 0, FALSE), stderr);
-    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
-}
-
-//! Print message about convergence of the EM
-static void print_converged(FILE*             fp,
-                            const char*       alg,
-                            real              ftol,
-                            int64_t           count,
-                            gmx_bool          bDone,
-                            int64_t           nsteps,
-                            const em_state_t* ems,
-                            double            sqrtNumAtoms)
-{
-    char buf[STEPSTRSIZE];
-
-    if (bDone)
-    {
-        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", alg, ftol, gmx_step_str(count, buf));
-    }
-    else if (count < nsteps)
-    {
-        fprintf(fp,
-                "\n%s converged to machine precision in %s steps,\n"
-                "but did not reach the requested Fmax < %g.\n",
-                alg, gmx_step_str(count, buf), ftol);
-    }
-    else
-    {
-        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", alg, ftol,
-                gmx_step_str(count, buf));
-    }
-
-#if GMX_DOUBLE
-    fprintf(fp, "Potential Energy  = %21.14e\n", ems->epot);
-    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1);
-    fprintf(fp, "Norm of force     = %21.14e\n", ems->fnorm / sqrtNumAtoms);
-#else
-    fprintf(fp, "Potential Energy  = %14.7e\n", ems->epot);
-    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1);
-    fprintf(fp, "Norm of force     = %14.7e\n", ems->fnorm / sqrtNumAtoms);
-#endif
-}
-
-//! Compute the norm and max of the force array in parallel
-static void get_f_norm_max(const t_commrec*               cr,
-                           t_grpopts*                     opts,
-                           t_mdatoms*                     mdatoms,
-                           gmx::ArrayRef<const gmx::RVec> f,
-                           real*                          fnorm,
-                           real*                          fmax,
-                           int*                           a_fmax)
-{
-    double fnorm2, *sum;
-    real   fmax2, fam;
-    int    la_max, a_max, start, end, i, m, gf;
-
-    /* This routine finds the largest force and returns it.
-     * On parallel machines the global max is taken.
-     */
-    fnorm2 = 0;
-    fmax2  = 0;
-    la_max = -1;
-    start  = 0;
-    end    = mdatoms->homenr;
-    if (mdatoms->cFREEZE)
-    {
-        for (i = start; i < end; i++)
-        {
-            gf  = mdatoms->cFREEZE[i];
-            fam = 0;
-            for (m = 0; m < DIM; m++)
-            {
-                if (!opts->nFreeze[gf][m])
-                {
-                    fam += gmx::square(f[i][m]);
-                }
-            }
-            fnorm2 += fam;
-            if (fam > fmax2)
-            {
-                fmax2  = fam;
-                la_max = i;
-            }
-        }
-    }
-    else
-    {
-        for (i = start; i < end; i++)
-        {
-            fam = norm2(f[i]);
-            fnorm2 += fam;
-            if (fam > fmax2)
-            {
-                fmax2  = fam;
-                la_max = i;
-            }
-        }
-    }
-
-    if (la_max >= 0 && DOMAINDECOMP(cr))
-    {
-        a_max = cr->dd->globalAtomIndices[la_max];
-    }
-    else
-    {
-        a_max = la_max;
-    }
-    if (PAR(cr))
-    {
-        snew(sum, 2 * cr->nnodes + 1);
-        sum[2 * cr->nodeid]     = fmax2;
-        sum[2 * cr->nodeid + 1] = a_max;
-        sum[2 * cr->nnodes]     = fnorm2;
-        gmx_sumd(2 * cr->nnodes + 1, sum, cr);
-        fnorm2 = sum[2 * cr->nnodes];
-        /* Determine the global maximum */
-        for (i = 0; i < cr->nnodes; i++)
-        {
-            if (sum[2 * i] > fmax2)
-            {
-                fmax2 = sum[2 * i];
-                a_max = gmx::roundToInt(sum[2 * i + 1]);
-            }
-        }
-        sfree(sum);
-    }
-
-    if (fnorm)
-    {
-        *fnorm = sqrt(fnorm2);
-    }
-    if (fmax)
-    {
-        *fmax = sqrt(fmax2);
-    }
-    if (a_fmax)
-    {
-        *a_fmax = a_max;
-    }
-}
-
-//! Compute the norm of the force
-static void get_state_f_norm_max(const t_commrec* cr, t_grpopts* opts, t_mdatoms* mdatoms, em_state_t* ems)
-{
-    get_f_norm_max(cr, opts, mdatoms, ems->f.view().force(), &ems->fnorm, &ems->fmax, &ems->a_fmax);
-}
-
-//! Initialize the energy minimization
-static void init_em(FILE*                fplog,
-                    const gmx::MDLogger& mdlog,
-                    const char*          title,
-                    const t_commrec*     cr,
-                    t_inputrec*          ir,
-                    gmx::ImdSession*     imdSession,
-                    pull_t*              pull_work,
-                    t_state*             state_global,
-                    const gmx_mtop_t*    top_global,
-                    em_state_t*          ems,
-                    gmx_localtop_t*      top,
-                    t_nrnb*              nrnb,
-                    t_forcerec*          fr,
-                    gmx::MDAtoms*        mdAtoms,
-                    gmx_global_stat_t*   gstat,
-                    VirtualSitesHandler* vsite,
-                    gmx::Constraints*    constr,
-                    gmx_shellfc_t**      shellfc)
-{
-    real dvdl_constr;
-
-    if (fplog)
-    {
-        fprintf(fplog, "Initiating %s\n", title);
-    }
-
-    if (MASTER(cr))
-    {
-        state_global->ngtc = 0;
-    }
-    int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
-    gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
-    initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
-
-    if (ir->eI == eiNM)
-    {
-        GMX_ASSERT(shellfc != nullptr, "With NM we always support shells");
-
-        *shellfc =
-                init_shell_flexcon(stdout, top_global, constr ? constr->numFlexibleConstraints() : 0,
-                                   ir->nstcalcenergy, DOMAINDECOMP(cr), thisRankHasDuty(cr, DUTY_PME));
-    }
-    else
-    {
-        GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI),
-                   "This else currently only handles energy minimizers, consider if your algorithm "
-                   "needs shell/flexible-constraint support");
-
-        /* With energy minimization, shells and flexible constraints are
-         * automatically minimized when treated like normal DOFS.
-         */
-        if (shellfc != nullptr)
-        {
-            *shellfc = nullptr;
-        }
-    }
-
-    if (DOMAINDECOMP(cr))
-    {
-        dd_init_local_state(cr->dd, state_global, &ems->s);
-
-        /* Distribute the charge groups over the nodes from the master node */
-        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
-                            imdSession, pull_work, &ems->s, &ems->f, mdAtoms, top, fr, vsite,
-                            constr, nrnb, nullptr, FALSE);
-        dd_store_state(cr->dd, &ems->s);
-    }
-    else
-    {
-        state_change_natoms(state_global, state_global->natoms);
-        /* Just copy the state */
-        ems->s = *state_global;
-        state_change_natoms(&ems->s, ems->s.natoms);
-
-        mdAlgorithmsSetupAtomData(cr, ir, *top_global, top, fr, &ems->f, mdAtoms, constr, vsite,
-                                  shellfc ? *shellfc : nullptr);
-    }
-
-    update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]);
-
-    if (constr)
-    {
-        // TODO how should this cross-module support dependency be managed?
-        if (ir->eConstrAlg == econtSHAKE && gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
-        {
-            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
-                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
-        }
-
-        if (!ir->bContinuation)
-        {
-            /* Constrain the starting coordinates */
-            bool needsLogging  = true;
-            bool computeEnergy = true;
-            bool computeVirial = false;
-            dvdl_constr        = 0;
-            constr->apply(needsLogging, computeEnergy, -1, 0, 1.0, ems->s.x.arrayRefWithPadding(),
-                          ems->s.x.arrayRefWithPadding(), ArrayRef<RVec>(), ems->s.box,
-                          ems->s.lambda[efptFEP], &dvdl_constr, gmx::ArrayRefWithPadding<RVec>(),
-                          computeVirial, nullptr, gmx::ConstraintVariable::Positions);
-        }
-    }
-
-    if (PAR(cr))
-    {
-        *gstat = global_stat_init(ir);
-    }
-    else
-    {
-        *gstat = nullptr;
-    }
-
-    calc_shifts(ems->s.box, fr->shift_vec);
-}
-
-//! Finalize the minimization
-static void finish_em(const t_commrec*          cr,
-                      gmx_mdoutf_t              outf,
-                      gmx_walltime_accounting_t walltime_accounting,
-                      gmx_wallcycle_t           wcycle)
-{
-    if (!thisRankHasDuty(cr, DUTY_PME))
-    {
-        /* Tell the PME only node to finish */
-        gmx_pme_send_finish(cr);
-    }
-
-    done_mdoutf(outf);
-
-    em_time_end(walltime_accounting, wcycle);
-}
-
-//! Swap two different EM states during minimization
-static void swap_em_state(em_state_t** ems1, em_state_t** ems2)
-{
-    em_state_t* tmp;
-
-    tmp   = *ems1;
-    *ems1 = *ems2;
-    *ems2 = tmp;
-}
-
-//! Write the EM state to the trajectory files and, when \p confout is not nullptr, also write a configuration file with that name
-static void write_em_traj(FILE*               fplog,
-                          const t_commrec*    cr,
-                          gmx_mdoutf_t        outf,
-                          gmx_bool            bX,
-                          gmx_bool            bF,
-                          const char*         confout,
-                          const gmx_mtop_t*   top_global,
-                          t_inputrec*         ir,
-                          int64_t             step,
-                          em_state_t*         state,
-                          t_state*            state_global,
-                          ObservablesHistory* observablesHistory)
-{
-    int mdof_flags = 0; // bit mask of MDOF_* flags, built from bX, bF and ir->bIMD below
-
-    if (bX)
-    {
-        mdof_flags |= MDOF_X;
-    }
-    if (bF)
-    {
-        mdof_flags |= MDOF_F;
-    }
-
-    /* If we want IMD output (ir->bIMD), set the appropriate MDOF flag */
-    if (ir->bIMD)
-    {
-        mdof_flags |= MDOF_IMD;
-    }
-
-    gmx::WriteCheckpointDataHolder checkpointDataHolder;
-    mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, top_global->natoms, step,
-                                     static_cast<double>(step), &state->s, state_global,
-                                     observablesHistory, state->f.view().force(), &checkpointDataHolder);
-
-    if (confout != nullptr)
-    {
-        if (DOMAINDECOMP(cr))
-        {
-            /* If bX=true, x was collected to state_global in the call above; otherwise collect it here */
-            if (!bX)
-            {
-                auto globalXRef = MASTER(cr) ? state_global->x : gmx::ArrayRef<gmx::RVec>(); // empty reference on non-master ranks
-                dd_collect_vec(cr->dd, state->s.ddp_count, state->s.ddp_count_cg_gl, state->s.cg_gl,
-                               state->s.x, globalXRef);
-            }
-        }
-        else
-        {
-            /* Without domain decomposition, point state_global at the local state so it is used for the write below */
-            state_global = &state->s;
-        }
-
-        if (MASTER(cr))
-        {
-            if (ir->pbcType != PbcType::No && !ir->bPeriodicMols && DOMAINDECOMP(cr))
-            {
-                /* Make molecules whole only for confout writing */
-                do_pbc_mtop(ir->pbcType, state->s.box, top_global, state_global->x.rvec_array());
-            }
-
-            write_sto_conf_mtop(confout, *top_global->name, top_global,
-                                state_global->x.rvec_array(), nullptr, ir->pbcType, state->s.box);
-        }
-    }
-}
-
-//! \brief Do one minimization step
-// Fills ems2 with the positions of ems1 displaced by a * force (frozen dimensions are copied unchanged) and constrains them when constr is set.
-// \returns true when the step succeeded, false when a constraint error occurred
-static bool do_em_step(const t_commrec*                          cr,
-                       t_inputrec*                               ir,
-                       t_mdatoms*                                md,
-                       em_state_t*                               ems1,
-                       real                                      a,
-                       gmx::ArrayRefWithPadding<const gmx::RVec> force,
-                       em_state_t*                               ems2,
-                       gmx::Constraints*                         constr,
-                       int64_t                                   count)
-
-{
-    t_state *s1, *s2;
-    int      start, end;
-    real     dvdl_constr;
-    int nthreads gmx_unused;
-
-    bool validStep = true; // stays true when no constraints are applied
-
-    s1 = &ems1->s;
-    s2 = &ems2->s;
-
-    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
-    {
-        gmx_incons("state mismatch in do_em_step");
-    }
-
-    s2->flags = s1->flags;
-
-    if (s2->natoms != s1->natoms)
-    {
-        state_change_natoms(s2, s1->natoms);
-        ems2->f.resize(s2->natoms);
-    }
-    if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size())
-    {
-        s2->cg_gl.resize(s1->cg_gl.size());
-    }
-
-    copy_mat(s1->box, s2->box);
-    /* Copy free energy state (the lambda values) */
-    s2->lambda = s1->lambda;
-    copy_mat(s1->box, s2->box); // NOTE(review): repeats the copy_mat call made three lines above with the same arguments
-
-    start = 0;
-    end   = md->homenr;
-
-    nthreads = gmx_omp_nthreads_get(emntUpdate);
-#pragma omp parallel num_threads(nthreads)
-    {
-        const rvec* x1 = s1->x.rvec_array();
-        rvec*       x2 = s2->x.rvec_array();
-        const rvec* f  = as_rvec_array(force.unpaddedArrayRef().data());
-
-        int gf = 0; // freeze group index; stays 0 when md->cFREEZE is null
-#pragma omp for schedule(static) nowait
-        for (int i = start; i < end; i++)
-        {
-            try
-            {
-                if (md->cFREEZE)
-                {
-                    gf = md->cFREEZE[i];
-                }
-                for (int m = 0; m < DIM; m++)
-                {
-                    if (ir->opts.nFreeze[gf][m])
-                    {
-                        x2[i][m] = x1[i][m]; // frozen dimension: keep the old coordinate
-                    }
-                    else
-                    {
-                        x2[i][m] = x1[i][m] + a * f[i][m]; // move by a times the supplied vector
-                    }
-                }
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-        }
-
-        if (s2->flags & (1 << estCGP))
-        {
-            /* Copy the CG p vector (cg_p) from s1 to s2 */
-            const rvec* p1 = s1->cg_p.rvec_array();
-            rvec*       p2 = s2->cg_p.rvec_array();
-#pragma omp for schedule(static) nowait
-            for (int i = start; i < end; i++)
-            {
-                // Trivial OpenMP block that does not throw
-                copy_rvec(p1[i], p2[i]);
-            }
-        }
-
-        if (DOMAINDECOMP(cr))
-        {
-            /* OpenMP does not support unsigned loop variables, hence the signed gmx::index */
-#pragma omp for schedule(static) nowait
-            for (gmx::index i = 0; i < gmx::ssize(s2->cg_gl); i++)
-            {
-                s2->cg_gl[i] = s1->cg_gl[i];
-            }
-        }
-    }
-
-    if (DOMAINDECOMP(cr))
-    {
-        s2->ddp_count       = s1->ddp_count;
-        s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
-    }
-
-    if (constr)
-    {
-        dvdl_constr = 0; // receives output from apply(); not read again in this function
-        validStep   = constr->apply(
-                TRUE, TRUE, count, 0, 1.0, s1->x.arrayRefWithPadding(), s2->x.arrayRefWithPadding(),
-                ArrayRef<RVec>(), s2->box, s2->lambda[efptBONDED], &dvdl_constr,
-                gmx::ArrayRefWithPadding<RVec>(), false, nullptr, gmx::ConstraintVariable::Positions);
-
-        if (cr->nnodes > 1)
-        {
-            /* This global reduction will affect performance at high
-             * parallelization, but we can not really avoid it.
-             * But usually EM is not run at high parallelization.
-             */
-            int reductionBuffer = static_cast<int>(!validStep); // 1 when this rank failed, 0 otherwise
-            gmx_sumi(1, &reductionBuffer, cr);
-            validStep = (reductionBuffer == 0); // valid only when no rank reported a failure
-        }
-
-        // We should move this check to the different minimizers
-        if (!validStep && ir->eI != eiSteep)
-        {
-            gmx_fatal(FARGS,
-                      "The coordinates could not be constrained. Minimizer '%s' can not handle "
-                      "constraint failures, use minimizer '%s' before using '%s'.",
-                      EI(ir->eI), EI(eiSteep), EI(ir->eI));
-        }
-    }
-
-    return validStep;
-}
-
-//! Prepare EM for using domain decomposition parallelization: repartition for the state in \p ems, then call dd_store_state() on it
-static void em_dd_partition_system(FILE*                fplog,
-                                   const gmx::MDLogger& mdlog,
-                                   int                  step,
-                                   const t_commrec*     cr,
-                                   const gmx_mtop_t*    top_global,
-                                   t_inputrec*          ir,
-                                   gmx::ImdSession*     imdSession,
-                                   pull_t*              pull_work,
-                                   em_state_t*          ems,
-                                   gmx_localtop_t*      top,
-                                   gmx::MDAtoms*        mdAtoms,
-                                   t_forcerec*          fr,
-                                   VirtualSitesHandler* vsite,
-                                   gmx::Constraints*    constr,
-                                   t_nrnb*              nrnb,
-                                   gmx_wallcycle_t      wcycle)
-{
-    /* Repartition the domain decomposition, passing the state and force buffer held by ems */
-    dd_partition_system(fplog, mdlog, step, cr, FALSE, 1, nullptr, *top_global, ir, imdSession, pull_work,
-                        &ems->s, &ems->f, mdAtoms, top, fr, vsite, constr, nrnb, wcycle, FALSE);
-    dd_store_state(cr->dd, &ems->s);
-}
-
-namespace
-{
-
-/*! \brief Class to handle the work of setting up and doing an energy evaluation.
- *
- * This class is a mere aggregate of parameters to pass to evaluate an
- * energy, so that future changes to names and types of them consume
- * less time when refactoring other code.
- *
- * Aggregate initialization is used, for which the chief risk is that
- * if a member is added at the end and not all initializer lists are
- * updated, then the member will be value initialized, which will
- * typically mean initialization to zero.
- *
- * Use a braced initializer list to construct one of these. */
-class EnergyEvaluator
-{
-public:
-    /*! \brief Evaluates the energy and forces for the state in \c ems and stores the potential energy in \c ems->epot.
-     *
-     * \todo In practice, the same objects mu_tot, vir, and pres
-     * are always passed to this function, so we would rather have
-     * them as data members. However, their C-array types are
-     * unsuited for aggregate initialization. When the types
-     * improve, the call signature of this method can be reduced.
-     */
-    void run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres, int64_t count, gmx_bool bFirst);
-    //! Handles logging (deprecated).
-    FILE* fplog;
-    //! Handles logging.
-    const gmx::MDLogger& mdlog;
-    //! Handles communication.
-    const t_commrec* cr;
-    //! Coordinates multi-simulations.
-    const gmx_multisim_t* ms;
-    //! Holds the simulation topology.
-    const gmx_mtop_t* top_global;
-    //! Holds the domain topology.
-    gmx_localtop_t* top;
-    //! User input options.
-    t_inputrec* inputrec;
-    //! The Interactive Molecular Dynamics session.
-    gmx::ImdSession* imdSession;
-    //! The pull work object.
-    pull_t* pull_work;
-    //! Manages flop accounting.
-    t_nrnb* nrnb;
-    //! Manages wall cycle accounting.
-    gmx_wallcycle_t wcycle;
-    //! Coordinates global reduction.
-    gmx_global_stat_t gstat;
-    //! Handles virtual sites (may be null; run() checks before use).
-    VirtualSitesHandler* vsite;
-    //! Handles constraints (may be null; run() checks before use).
-    gmx::Constraints* constr;
-    //! Per-atom data for this domain.
-    gmx::MDAtoms* mdAtoms;
-    //! Handles how to calculate the forces.
-    t_forcerec* fr;
-    //! Schedule of force-calculation work each step for this task.
-    MdrunScheduleWorkload* runScheduleWork;
-    //! Stores the computed energies.
-    gmx_enerdata_t* enerd;
-};
-
-void EnergyEvaluator::run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres, int64_t count, gmx_bool bFirst)
-{
-    real     t;
-    gmx_bool bNS; // whether GMX_FORCE_NS is passed to do_force() below
-    tensor   force_vir, shake_vir, ekin;
-    real     dvdl_constr;
-    real     terminate = 0; // only handed to global_stat(); not read again in this function
-
-    /* Set the time to the initial time, the time does not change during EM */
-    t = inputrec->init_t;
-
-    if (bFirst || (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
-    {
-        /* This is the first state or an old state used before the last ns */
-        bNS = TRUE;
-    }
-    else
-    {
-        bNS = FALSE;
-        if (inputrec->nstlist > 0) // any positive nstlist turns bNS on for this evaluation
-        {
-            bNS = TRUE;
-        }
-    }
-
-    if (vsite)
-    {
-        vsite->construct(ems->s.x, 1, {}, ems->s.box);
-    }
-
-    if (DOMAINDECOMP(cr) && bNS)
-    {
-        /* Repartition the domain decomposition */
-        em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, imdSession, pull_work,
-                               ems, top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-    }
-
-    /* Calc force & energy on new trial position  */
-    /* do_force always puts the charge groups in the box and shifts again
-     * We do not unshift, so molecules are always whole in congrad.c
-     */
-    do_force(fplog, cr, ms, inputrec, nullptr, nullptr, imdSession, pull_work, count, nrnb, wcycle,
-             top, ems->s.box, ems->s.x.arrayRefWithPadding(), &ems->s.hist, &ems->f.view(), force_vir,
-             mdAtoms->mdatoms(), enerd, ems->s.lambda, fr, runScheduleWork, vsite, mu_tot, t, nullptr,
-             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY
-                     | (bNS ? GMX_FORCE_NS : 0),
-             DDBalanceRegionHandler(cr));
-
-    /* Clear the unused shake virial and pressure */
-    clear_mat(shake_vir);
-    clear_mat(pres);
-
-    /* Communicate stuff when parallel (skipped when the integrator is eiNM) */
-    if (PAR(cr) && inputrec->eI != eiNM)
-    {
-        wallcycle_start(wcycle, ewcMoveE);
-
-        global_stat(gstat, cr, enerd, force_vir, shake_vir, inputrec, nullptr, nullptr, nullptr, 1,
-                    &terminate, nullptr, FALSE, CGLO_ENERGY | CGLO_PRESSURE | CGLO_CONSTRAINT);
-
-        wallcycle_stop(wcycle, ewcMoveE);
-    }
-
-    if (fr->dispersionCorrection)
-    {
-        /* Calculate long range corrections to pressure and energy */
-        const DispersionCorrection::Correction correction =
-                fr->dispersionCorrection->calculate(ems->s.box, ems->s.lambda[efptVDW]);
-
-        enerd->term[F_DISPCORR] = correction.energy;
-        enerd->term[F_EPOT] += correction.energy;
-        enerd->term[F_PRES] += correction.pressure; // NOTE(review): F_PRES is assigned from calc_pres() further down, which overwrites this increment - confirm intent
-        enerd->term[F_DVDL] += correction.dvdl;
-    }
-    else
-    {
-        enerd->term[F_DISPCORR] = 0;
-    }
-
-    ems->epot = enerd->term[F_EPOT]; // potential energy of this state, including the correction added above
-
-    if (constr)
-    {
-        /* Project out the constraint components of the force */
-        bool needsLogging  = false;
-        bool computeEnergy = false;
-        bool computeVirial = true;
-        dvdl_constr        = 0;
-        auto f             = ems->f.view().forceWithPadding();
-        constr->apply(needsLogging, computeEnergy, count, 0, 1.0, ems->s.x.arrayRefWithPadding(), f,
-                      f.unpaddedArrayRef(), ems->s.box, ems->s.lambda[efptBONDED], &dvdl_constr,
-                      gmx::ArrayRefWithPadding<RVec>(), computeVirial, shake_vir,
-                      gmx::ConstraintVariable::ForceDispl);
-        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-        m_add(force_vir, shake_vir, vir); // vir = force_vir + shake_vir
-    }
-    else
-    {
-        copy_mat(force_vir, vir); // no constraints: vir is just force_vir
-    }
-
-    clear_mat(ekin); // a zeroed ekin tensor is what gets passed to calc_pres()
-    enerd->term[F_PRES] = calc_pres(fr->pbcType, inputrec->nwall, ems->s.box, ekin, vir, pres);
-
-    if (inputrec->efep != efepNO)
-    {
-        accumulateKineticLambdaComponents(enerd, ems->s.lambda, *inputrec->fepvals);
-    }
-
-    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
-    {
-        get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems);
-    }
-}
-
-} // namespace
-
-//! Parallel utility returning this rank's part of sum((fb - fm) * fb) over non-frozen dimensions, matching the forces of \p s_min and \p s_b through their global atom indices
-static double reorder_partsum(const t_commrec*  cr,
-                              t_grpopts*        opts,
-                              const gmx_mtop_t* top_global,
-                              const em_state_t* s_min,
-                              const em_state_t* s_b)
-{
-    if (debug)
-    {
-        fprintf(debug, "Doing reorder_partsum\n");
-    }
-
-    auto fm = s_min->f.view().force();
-    auto fb = s_b->f.view().force();
-
-    /* Collect fm in a global vector fmg.
-     * This conflicts with the spirit of domain decomposition,
-     * but to fully optimize this a much more complicated algorithm is required.
-     */
-    const int natoms = top_global->natoms;
-    rvec*     fmg;
-    snew(fmg, natoms); // presumably zero-initialised, which the cross-rank sum below relies on - TODO confirm
-
-    gmx::ArrayRef<const int> indicesMin = s_min->s.cg_gl;
-    int                      i          = 0;
-    for (int a : indicesMin)
-    {
-        copy_rvec(fm[i], fmg[a]); // local entry i of fm goes to global slot a
-        i++;
-    }
-    gmx_sum(top_global->natoms * 3, fmg[0], cr);
-
-    /* Now we will determine the part of the sum for the cgs in state s_b */
-    gmx::ArrayRef<const int> indicesB = s_b->s.cg_gl;
-
-    double partsum                        = 0;
-    i                                     = 0;
-    int                                gf = 0;
-    gmx::ArrayRef<const unsigned char> grpnrFREEZE =
-            top_global->groups.groupNumbers[SimulationAtomGroupType::Freeze];
-    for (int a : indicesB)
-    {
-        if (!grpnrFREEZE.empty())
-        {
-            gf = grpnrFREEZE[i]; // NOTE(review): indexed with the local counter i although grpnrFREEZE comes from top_global - confirm whether a was intended
-        }
-        for (int m = 0; m < DIM; m++)
-        {
-            if (!opts->nFreeze[gf][m])
-            {
-                partsum += (fb[i][m] - fmg[a][m]) * fb[i][m];
-            }
-        }
-        i++;
-    }
-
-    sfree(fmg);
-
-    return partsum;
-}
-
-//! Compute the Polak-Ribiere beta: sum((fb - fm) * fb) over non-frozen dimensions divided by the square of s_min->fnorm (nothing is printed here)
-static real pr_beta(const t_commrec*  cr,
-                    t_grpopts*        opts,
-                    t_mdatoms*        mdatoms,
-                    const gmx_mtop_t* top_global,
-                    const em_state_t* s_min,
-                    const em_state_t* s_b)
-{
-    double sum;
-
-    /* This is just the classical Polak-Ribiere calculation of beta;
-     * it looks a bit complicated since we take freeze groups into account,
-     * and might have to sum it in parallel runs.
-     */
-
-    if (!DOMAINDECOMP(cr)
-        || (s_min->s.ddp_count == cr->dd->ddp_count && s_b->s.ddp_count == cr->dd->ddp_count))
-    {
-        auto fm = s_min->f.view().force();
-        auto fb = s_b->f.view().force();
-        sum     = 0;
-        int gf  = 0; // freeze group index; stays 0 when mdatoms->cFREEZE is null
-        /* This part of code can be incorrect with DD,
-         * since the atom ordering in s_b and s_min might differ.
-         */
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            if (mdatoms->cFREEZE)
-            {
-                gf = mdatoms->cFREEZE[i];
-            }
-            for (int m = 0; m < DIM; m++)
-            {
-                if (!opts->nFreeze[gf][m])
-                {
-                    sum += (fb[i][m] - fm[i][m]) * fb[i][m];
-                }
-            }
-        }
-    }
-    else
-    {
-        /* The two states do not both match the current DD count: we need to reorder cgs while summing */
-        sum = reorder_partsum(cr, opts, top_global, s_min, s_b);
-    }
-    if (PAR(cr))
-    {
-        gmx_sumd(1, &sum, cr);
-    }
-
-    return sum / gmx::square(s_min->fnorm);
-}
-
-namespace gmx
-{
-
-void LegacySimulator::do_cg()
-{
-    const char* CG = "Polak-Ribiere Conjugate Gradients";
-
-    gmx_localtop_t    top(top_global->ffparams);
-    gmx_global_stat_t gstat;
-    double            tmp, minstep;
-    real              stepsize;
-    real              a, b, c, beta = 0.0;
-    real              epot_repl = 0;
-    real              pnorm;
-    gmx_bool          converged, foundlower;
-    rvec              mu_tot = { 0 };
-    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
-    tensor            vir, pres;
-    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
-    int               m, step, nminstep;
-    auto              mdatoms = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating conjugate gradient energy minimization via the "
-                    "integrator .mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx minimize and an .mdp option.");
-
-    step = 0;
-
-    if (MASTER(cr))
-    {
-        // In CG, the state is extended with a search direction
-        state_global->flags |= (1 << estCGP);
-
-        // Ensure the extra per-atom state array gets allocated
-        state_change_natoms(state_global, state_global->natoms);
-
-        // Initialize the search direction to zero
-        for (RVec& cg_p : state_global->cg_p)
-        {
-            cg_p = { 0, 0, 0 };
-        }
-    }
-
-    /* Create 4 states on the stack and extract pointers that we will swap */
-    em_state_t  s0{}, s1{}, s2{}, s3{};
-    em_state_t* s_min = &s0;
-    em_state_t* s_a   = &s1;
-    em_state_t* s_b   = &s2;
-    em_state_t* s_c   = &s3;
-
-    /* Init em and store the local state in s_min */
-    init_em(fplog, mdlog, CG, cr, inputrec, imdSession, pull_work, state_global, top_global, s_min,
-            &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, nullptr);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work,
-                                   nullptr, false, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    /* Print to log file */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, CG);
-
-    /* Max number of steps */
-    number_steps = inputrec->nsteps;
-
-    if (MASTER(cr))
-    {
-        sp_header(stderr, CG, inputrec->em_tol, number_steps);
-    }
-    if (fplog)
-    {
-        sp_header(fplog, CG, inputrec->em_tol, number_steps);
-    }
-
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-    /* Call the force routine and some auxiliary (neighboursearching etc.) */
-    /* do_force always puts the charge groups in the box and shifts again
-     * We do not unshift, so molecules are always whole in congrad.c
-     */
-    energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE);
-
-    if (MASTER(cr))
-    {
-        /* Copy stuff to the energy bin for easy printing etc. */
-        matrix nullBox = {};
-        energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                         enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                         nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-        EnergyOutput::printHeader(fplog, step, step);
-        energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step,
-                                           step, fr->fcdata.get(), nullptr);
-    }
-
-    /* Estimate/guess the initial stepsize */
-    stepsize = inputrec->em_stepsize / s_min->fnorm;
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", s_min->fmax, s_min->a_fmax + 1);
-        fprintf(stderr, "   F-Norm            = %12.5e\n", s_min->fnorm / sqrtNumAtoms);
-        fprintf(stderr, "\n");
-        /* and copy to the log file too... */
-        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", s_min->fmax, s_min->a_fmax + 1);
-        fprintf(fplog, "   F-Norm            = %12.5e\n", s_min->fnorm / sqrtNumAtoms);
-        fprintf(fplog, "\n");
-    }
-    /* Start the loop over CG steps.
-     * Each successful step is counted, and we continue until
-     * we either converge or reach the max number of steps.
-     */
-    converged = FALSE;
-    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
-    {
-
-        /* start taking steps in a new direction
-         * First time we enter the routine, beta=0, and the direction is
-         * simply the negative gradient.
-         */
-
-        /* Calculate the new direction in p, and the gradient in this direction, gpa */
-        gmx::ArrayRef<gmx::RVec>       pm  = s_min->s.cg_p;
-        gmx::ArrayRef<const gmx::RVec> sfm = s_min->f.view().force();
-        double                         gpa = 0;
-        int                            gf  = 0;
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            if (mdatoms->cFREEZE)
-            {
-                gf = mdatoms->cFREEZE[i];
-            }
-            for (m = 0; m < DIM; m++)
-            {
-                if (!inputrec->opts.nFreeze[gf][m])
-                {
-                    pm[i][m] = sfm[i][m] + beta * pm[i][m];
-                    gpa -= pm[i][m] * sfm[i][m];
-                    /* f is negative gradient, thus the sign */
-                }
-                else
-                {
-                    pm[i][m] = 0;
-                }
-            }
-        }
-
-        /* Sum the gradient along the line across CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &gpa, cr);
-        }
-
-        /* Calculate the norm of the search vector */
-        get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr);
-
-        /* Just in case stepsize reaches zero due to numerical precision... */
-        if (stepsize <= 0)
-        {
-            stepsize = inputrec->em_stepsize / pnorm;
-        }
-
-        /*
-         * Double check the value of the derivative in the search direction.
-         * If it is positive it must be due to the old information in the
-         * CG formula, so just remove that and start over with beta=0.
-         * This corresponds to a steepest descent step.
-         */
-        if (gpa > 0)
-        {
-            beta = 0;
-            step--;   /* Don't count this step since we are restarting */
-            continue; /* Go back to the beginning of the big for-loop */
-        }
-
-        /* Calculate minimum allowed stepsize, before the average (norm)
-         * relative change in coordinate is smaller than precision
-         */
-        minstep      = 0;
-        auto s_min_x = makeArrayRef(s_min->s.x);
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            for (m = 0; m < DIM; m++)
-            {
-                tmp = fabs(s_min_x[i][m]);
-                if (tmp < 1.0)
-                {
-                    tmp = 1.0;
-                }
-                tmp = pm[i][m] / tmp;
-                minstep += tmp * tmp;
-            }
-        }
-        /* Add up from all CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &minstep, cr);
-        }
-
-        minstep = GMX_REAL_EPS / sqrt(minstep / (3 * top_global->natoms));
-
-        if (stepsize < minstep)
-        {
-            converged = TRUE;
-            break;
-        }
-
-        /* Write coordinates if necessary */
-        do_x = do_per_step(step, inputrec->nstxout);
-        do_f = do_per_step(step, inputrec->nstfout);
-
-        write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, top_global, inputrec, step, s_min,
-                      state_global, observablesHistory);
-
-        /* Take a step downhill.
-         * In theory, we should minimize the function along this direction.
-         * That is quite possible, but it turns out to take 5-10 function evaluations
-         * for each line. However, we dont really need to find the exact minimum -
-         * it is much better to start a new CG step in a modified direction as soon
-         * as we are close to it. This will save a lot of energy evaluations.
-         *
-         * In practice, we just try to take a single step.
-         * If it worked (i.e. lowered the energy), we increase the stepsize but
-         * the continue straight to the next CG step without trying to find any minimum.
-         * If it didn't work (higher energy), there must be a minimum somewhere between
-         * the old position and the new one.
-         *
-         * Due to the finite numerical accuracy, it turns out that it is a good idea
-         * to even accept a SMALL increase in energy, if the derivative is still downhill.
-         * This leads to lower final energies in the tests I've done. / Erik
-         */
-        s_a->epot = s_min->epot;
-        a         = 0.0;
-        c         = a + stepsize; /* reference position along line is zero */
-
-        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
-        {
-            em_dd_partition_system(fplog, mdlog, step, cr, top_global, inputrec, imdSession,
-                                   pull_work, s_min, &top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-        }
-
-        /* Take a trial step (new coords in s_c) */
-        do_em_step(cr, inputrec, mdatoms, s_min, c, s_min->s.cg_p.constArrayRefWithPadding(), s_c,
-                   constr, -1);
-
-        neval++;
-        /* Calculate energy for the trial step */
-        energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE);
-
-        /* Calc derivative along line */
-        const rvec*                    pc  = s_c->s.cg_p.rvec_array();
-        gmx::ArrayRef<const gmx::RVec> sfc = s_c->f.view().force();
-        double                         gpc = 0;
-        for (int i = 0; i < mdatoms->homenr; i++)
-        {
-            for (m = 0; m < DIM; m++)
-            {
-                gpc -= pc[i][m] * sfc[i][m]; /* f is negative gradient, thus the sign */
-            }
-        }
-        /* Sum the gradient along the line across CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &gpc, cr);
-        }
-
-        /* This is the max amount of increase in energy we tolerate */
-        tmp = std::sqrt(GMX_REAL_EPS) * fabs(s_a->epot);
-
-        /* Accept the step if the energy is lower, or if it is not significantly higher
-         * and the line derivative is still negative.
-         */
-        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
-        {
-            foundlower = TRUE;
-            /* Great, we found a better energy. Increase step for next iteration
-             * if we are still going down, decrease it otherwise
-             */
-            if (gpc < 0)
-            {
-                stepsize *= 1.618034; /* The golden section */
-            }
-            else
-            {
-                stepsize *= 0.618034; /* 1/golden section */
-            }
-        }
-        else
-        {
-            /* New energy is the same or higher. We will have to do some work
-             * to find a smaller value in the interval. Take smaller step next time!
-             */
-            foundlower = FALSE;
-            stepsize *= 0.618034;
-        }
-
-
-        /* OK, if we didn't find a lower value we will have to locate one now - there must
-         * be one in the interval [a=0,c].
-         * The same thing is valid here, though: Don't spend dozens of iterations to find
-         * the line minimum. We try to interpolate based on the derivative at the endpoints,
-         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
-         *
-         * I also have a safeguard for potentially really pathological functions so we never
-         * take more than 20 steps before we give up ...
-         *
-         * If we already found a lower value we just skip this step and continue to the update.
-         */
-        double gpb;
-        if (!foundlower)
-        {
-            nminstep = 0;
-
-            do
-            {
-                /* Select a new trial point.
-                 * If the derivatives at points a & c have different sign we interpolate to zero,
-                 * otherwise just do a bisection.
-                 */
-                if (gpa < 0 && gpc > 0)
-                {
-                    b = a + gpa * (a - c) / (gpc - gpa);
-                }
-                else
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                /* safeguard if interpolation close to machine accuracy causes errors:
-                 * never go outside the interval
-                 */
-                if (b <= a || b >= c)
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
-                {
-                    /* Reload the old state */
-                    em_dd_partition_system(fplog, mdlog, -1, cr, top_global, inputrec, imdSession, pull_work,
-                                           s_min, &top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-                }
-
-                /* Take a trial step to this new point - new coords in s_b */
-                do_em_step(cr, inputrec, mdatoms, s_min, b,
-                           s_min->s.cg_p.constArrayRefWithPadding(), s_b, constr, -1);
-
-                neval++;
-                /* Calculate energy for the trial step */
-                energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE);
-
-                /* p does not change within a step, but since the domain decomposition
-                 * might change, we have to use cg_p of s_b here.
-                 */
-                const rvec*                    pb  = s_b->s.cg_p.rvec_array();
-                gmx::ArrayRef<const gmx::RVec> sfb = s_b->f.view().force();
-                gpb                                = 0;
-                for (int i = 0; i < mdatoms->homenr; i++)
-                {
-                    for (m = 0; m < DIM; m++)
-                    {
-                        gpb -= pb[i][m] * sfb[i][m]; /* f is negative gradient, thus the sign */
-                    }
-                }
-                /* Sum the gradient along the line across CPUs */
-                if (PAR(cr))
-                {
-                    gmx_sumd(1, &gpb, cr);
-                }
-
-                if (debug)
-                {
-                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", s_a->epot, s_b->epot,
-                            s_c->epot, gpb);
-                }
-
-                epot_repl = s_b->epot;
-
-                /* Keep one of the intervals based on the value of the derivative at the new point */
-                if (gpb > 0)
-                {
-                    /* Replace c endpoint with b */
-                    swap_em_state(&s_b, &s_c);
-                    c   = b;
-                    gpc = gpb;
-                }
-                else
-                {
-                    /* Replace a endpoint with b */
-                    swap_em_state(&s_b, &s_a);
-                    a   = b;
-                    gpa = gpb;
-                }
-
-                /*
-                 * Stop search as soon as we find a value smaller than the endpoints.
-                 * Never run more than 20 steps, no matter what.
-                 */
-                nminstep++;
-            } while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && (nminstep < 20));
-
-            if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot) * GMX_REAL_EPS || nminstep >= 20)
-            {
-                /* OK. We couldn't find a significantly lower energy.
-                 * If beta==0 this was steepest descent, and then we give up.
-                 * If not, set beta=0 and restart with steepest descent before quitting.
-                 */
-                if (beta == 0.0)
-                {
-                    /* Converged */
-                    converged = TRUE;
-                    break;
-                }
-                else
-                {
-                    /* Reset memory before giving up */
-                    beta = 0.0;
-                    continue;
-                }
-            }
-
-            /* Select min energy state of A & C, put the best in B.
-             */
-            if (s_c->epot < s_a->epot)
-            {
-                if (debug)
-                {
-                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", s_c->epot,
-                            s_a->epot);
-                }
-                swap_em_state(&s_b, &s_c);
-                gpb = gpc;
-            }
-            else
-            {
-                if (debug)
-                {
-                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", s_a->epot,
-                            s_c->epot);
-                }
-                swap_em_state(&s_b, &s_a);
-                gpb = gpa;
-            }
-        }
-        else
-        {
-            if (debug)
-            {
-                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", s_c->epot);
-            }
-            swap_em_state(&s_b, &s_c);
-            gpb = gpc;
-        }
-
-        /* new search direction */
-        /* beta = 0 means forget all memory and restart with steepest descents. */
-        if (nstcg && ((step % nstcg) == 0))
-        {
-            beta = 0.0;
-        }
-        else
-        {
-            /* s_min->fnorm cannot be zero, because then we would have converged
-             * and broken out.
-             */
-
-            /* Polak-Ribiere update.
-             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
-             */
-            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
-        }
-        /* Limit beta to prevent oscillations */
-        if (fabs(beta) > 5.0)
-        {
-            beta = 0.0;
-        }
-
-
-        /* update positions */
-        swap_em_state(&s_min, &s_b);
-        gpa = gpb;
-
-        /* Print it if necessary */
-        if (MASTER(cr))
-        {
-            if (mdrunOptions.verbose)
-            {
-                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", step,
-                        s_min->epot, s_min->fnorm / sqrtNumAtoms, s_min->fmax, s_min->a_fmax + 1);
-                fflush(stderr);
-            }
-            /* Store the new (lower) energies */
-            matrix nullBox = {};
-            energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                             enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                             nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-            do_log = do_per_step(step, inputrec->nstlog);
-            do_ene = do_per_step(step, inputrec->nstenergy);
-
-            imdSession->fillEnergyRecord(step, TRUE);
-
-            if (do_log)
-            {
-                EnergyOutput::printHeader(fplog, step, step);
-            }
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
-                                               do_log ? fplog : nullptr, step, step,
-                                               fr->fcdata.get(), nullptr);
-        }
-
-        /* Send energies and positions to the IMD client if bIMD is TRUE. */
-        if (MASTER(cr) && imdSession->run(step, TRUE, state_global->box, state_global->x.rvec_array(), 0))
-        {
-            imdSession->sendPositionsAndEnergies();
-        }
-
-        /* Stop when the maximum force lies below tolerance.
-         * If we have reached machine precision, converged is already set to true.
-         */
-        converged = converged || (s_min->fmax < inputrec->em_tol);
-
-    } /* End of the loop */
-
-    if (converged)
-    {
-        step--; /* we never took that last step in this case */
-    }
-    if (s_min->fmax > inputrec->em_tol)
-    {
-        if (MASTER(cr))
-        {
-            warn_step(fplog, inputrec->em_tol, s_min->fmax, step - 1 == number_steps, FALSE);
-        }
-        converged = FALSE;
-    }
-
-    if (MASTER(cr))
-    {
-        /* If we printed energy and/or logfile last step (which was the last step)
-         * we don't have to do it again, but otherwise print the final values.
-         */
-        if (!do_log)
-        {
-            /* Write final value to log since we didn't do anything the last step */
-            EnergyOutput::printHeader(fplog, step, step);
-        }
-        if (!do_ene || !do_log)
-        {
-            /* Write final energy file entries */
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
-                                               !do_log ? fplog : nullptr, step, step,
-                                               fr->fcdata.get(), nullptr);
-        }
-    }
-
-    /* Print some stuff... */
-    if (MASTER(cr))
-    {
-        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
-    }
-
-    /* IMPORTANT!
-     * For accurate normal mode calculation it is imperative that we
-     * store the last conformation into the full precision binary trajectory.
-     *
-     * However, we should only do it if we did NOT already write this step
-     * above (which we did if do_x or do_f was true).
-     */
-    /* Note that with 0 < nstfout != nstxout we can end up with two frames
-     * in the trajectory with the same step number.
-     */
-    do_x = !do_per_step(step, inputrec->nstxout);
-    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
-
-    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), top_global, inputrec,
-                  step, s_min, state_global, observablesHistory);
-
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, s_min, sqrtNumAtoms);
-        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, s_min, sqrtNumAtoms);
-
-        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    /* To print the actual number of steps we needed somewhere */
-    walltime_accounting_set_nsteps_done(walltime_accounting, step);
-}
-
-
-void LegacySimulator::do_lbfgs()
-{
-    static const char* LBFGS = "Low-Memory BFGS Minimizer";
-    em_state_t         ems;
-    gmx_localtop_t     top(top_global->ffparams);
-    gmx_global_stat_t  gstat;
-    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
-    double             stepsize, step_taken, gpa, gpb, gpc, tmp, minstep;
-    real *             rho, *alpha, *p, *s, **dx, **dg;
-    real               a, b, c, maxdelta, delta;
-    real               diag, Epot0;
-    real               dgdx, dgdg, sq, yr, beta;
-    gmx_bool           converged;
-    rvec               mu_tot = { 0 };
-    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
-    tensor             vir, pres;
-    int                start, end, number_steps;
-    int                i, k, m, n, gf, step;
-    int                mdof_flags;
-    auto               mdatoms = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating L-BFGS energy minimization via the "
-                    "integrator .mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx minimize and an .mdp option.");
-
-    if (PAR(cr))
-    {
-        gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank");
-    }
-
-    if (nullptr != constr)
-    {
-        gmx_fatal(
-                FARGS,
-                "The combination of constraints and L-BFGS minimization is not implemented. Either "
-                "do not use constraints, or use another minimizer (e.g. steepest descent).");
-    }
-
-    n        = 3 * state_global->natoms;
-    nmaxcorr = inputrec->nbfgscorr;
-
-    snew(frozen, n);
-
-    snew(p, n);
-    snew(rho, nmaxcorr);
-    snew(alpha, nmaxcorr);
-
-    snew(dx, nmaxcorr);
-    for (i = 0; i < nmaxcorr; i++)
-    {
-        snew(dx[i], n);
-    }
-
-    snew(dg, nmaxcorr);
-    for (i = 0; i < nmaxcorr; i++)
-    {
-        snew(dg[i], n);
-    }
-
-    step  = 0;
-    neval = 0;
-
-    /* Init em */
-    init_em(fplog, mdlog, LBFGS, cr, inputrec, imdSession, pull_work, state_global, top_global,
-            &ems, &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, nullptr);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work,
-                                   nullptr, false, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    start = 0;
-    end   = mdatoms->homenr;
-
-    /* We need 4 working states */
-    em_state_t  s0{}, s1{}, s2{}, s3{};
-    em_state_t* sa   = &s0;
-    em_state_t* sb   = &s1;
-    em_state_t* sc   = &s2;
-    em_state_t* last = &s3;
-    /* Initialize by copying the state from ems (we could skip x and f here) */
-    *sa = ems;
-    *sb = ems;
-    *sc = ems;
-
-    /* Print to log file */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS);
-
-    do_log = do_ene = do_x = do_f = TRUE;
-
-    /* Max number of steps */
-    number_steps = inputrec->nsteps;
-
-    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
-    gf = 0;
-    for (i = start; i < end; i++)
-    {
-        if (mdatoms->cFREEZE)
-        {
-            gf = mdatoms->cFREEZE[i];
-        }
-        for (m = 0; m < DIM; m++)
-        {
-            frozen[3 * i + m] = (inputrec->opts.nFreeze[gf][m] != 0);
-        }
-    }
-    if (MASTER(cr))
-    {
-        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
-    }
-    if (fplog)
-    {
-        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
-    }
-
-    if (vsite)
-    {
-        vsite->construct(state_global->x, 1, {}, state_global->box);
-    }
-
-    /* Call the force routine and some auxiliary (neighboursearching etc.) */
-    /* do_force always puts the charge groups in the box and shifts again
-     * We do not unshift, so molecules are always whole
-     */
-    neval++;
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-    energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE);
-
-    if (MASTER(cr))
-    {
-        /* Copy stuff to the energy bin for easy printing etc. */
-        matrix nullBox = {};
-        energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                         enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                         nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-        EnergyOutput::printHeader(fplog, step, step);
-        energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step,
-                                           step, fr->fcdata.get(), nullptr);
-    }
-
-    /* Set the initial step.
-     * since it will be multiplied by the non-normalized search direction
-     * vector (force vector the first time), we scale it by the
-     * norm of the force.
-     */
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
-        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
-        fprintf(stderr, "   F-Norm            = %12.5e\n", ems.fnorm / sqrtNumAtoms);
-        fprintf(stderr, "\n");
-        /* and copy to the log file too... */
-        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
-        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1);
-        fprintf(fplog, "   F-Norm            = %12.5e\n", ems.fnorm / sqrtNumAtoms);
-        fprintf(fplog, "\n");
-    }
-
-    // Point is an index to the memory of search directions, where 0 is the first one.
-    point = 0;
-
-    // Set initial search direction to the force (-gradient), or 0 for frozen particles.
-    real* fInit = static_cast<real*>(ems.f.view().force().data()[0]);
-    for (i = 0; i < n; i++)
-    {
-        if (!frozen[i])
-        {
-            dx[point][i] = fInit[i]; /* Initial search direction */
-        }
-        else
-        {
-            dx[point][i] = 0;
-        }
-    }
-
-    // Stepsize will be modified during the search, and actually it is not critical
-    // (the main efficiency in the algorithm comes from changing directions), but
-    // we still need an initial value, so estimate it as the inverse of the norm
-    // so we take small steps where the potential fluctuates a lot.
-    stepsize = 1.0 / ems.fnorm;
-
-    /* Start the loop over BFGS steps.
-     * Each successful step is counted, and we continue until
-     * we either converge or reach the max number of steps.
-     */
-
-    ncorr = 0;
-
-    /* Set the gradient from the force */
-    converged = FALSE;
-    for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++)
-    {
-
-        /* Write coordinates if necessary */
-        do_x = do_per_step(step, inputrec->nstxout);
-        do_f = do_per_step(step, inputrec->nstfout);
-
-        mdof_flags = 0;
-        if (do_x)
-        {
-            mdof_flags |= MDOF_X;
-        }
-
-        if (do_f)
-        {
-            mdof_flags |= MDOF_F;
-        }
-
-        if (inputrec->bIMD)
-        {
-            mdof_flags |= MDOF_IMD;
-        }
-
-        gmx::WriteCheckpointDataHolder checkpointDataHolder;
-        mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, top_global->natoms, step,
-                                         static_cast<real>(step), &ems.s, state_global, observablesHistory,
-                                         ems.f.view().force(), &checkpointDataHolder);
-
-        /* Do the linesearching in the direction dx[point][0..(n-1)] */
-
-        /* make s a pointer to current search direction - point=0 first time we get here */
-        s = dx[point];
-
-        real* xx = static_cast<real*>(ems.s.x.rvec_array()[0]);
-        real* ff = static_cast<real*>(ems.f.view().force().data()[0]);
-
-        // calculate line gradient in position A
-        for (gpa = 0, i = 0; i < n; i++)
-        {
-            gpa -= s[i] * ff[i];
-        }
-
-        /* Calculate minimum allowed stepsize along the line, before the average (norm)
-         * relative change in coordinate is smaller than precision
-         */
-        for (minstep = 0, i = 0; i < n; i++)
-        {
-            tmp = fabs(xx[i]);
-            if (tmp < 1.0)
-            {
-                tmp = 1.0;
-            }
-            tmp = s[i] / tmp;
-            minstep += tmp * tmp;
-        }
-        minstep = GMX_REAL_EPS / sqrt(minstep / n);
-
-        if (stepsize < minstep)
-        {
-            converged = TRUE;
-            break;
-        }
-
-        // Before taking any steps along the line, store the old position
-        *last       = ems;
-        real* lastx = static_cast<real*>(last->s.x.data()[0]);
-        real* lastf = static_cast<real*>(last->f.view().force().data()[0]);
-        Epot0       = ems.epot;
-
-        *sa = ems;
-
-        /* Take a step downhill.
-         * In theory, we should find the actual minimum of the function in this
-         * direction, somewhere along the line.
-         * That is quite possible, but it turns out to take 5-10 function evaluations
-         * for each line. However, we don't really need to find the exact minimum -
-         * it is much better to start a new BFGS step in a modified direction as soon
-         * as we are close to it. This will save a lot of energy evaluations.
-         *
-         * In practice, we just try to take a single step.
-         * If it worked (i.e. lowered the energy), we increase the stepsize but
-         * continue straight to the next BFGS step without trying to find any minimum,
-         * i.e. we change the search direction too. If the line was smooth, it is
-         * likely we are in a smooth region, and then it makes sense to take longer
-         * steps in the modified search direction too.
-         *
-         * If it didn't work (higher energy), there must be a minimum somewhere between
-         * the old position and the new one. Then we need to start by finding a lower
-         * value before we change search direction. Since the energy was apparently
-         * quite rough, we need to decrease the step size.
-         *
-         * Due to the finite numerical accuracy, it turns out that it is a good idea
-         * to accept a SMALL increase in energy, if the derivative is still downhill.
-         * This leads to lower final energies in the tests I've done. / Erik
-         */
-
-        // State "A" is the first position along the line.
-        // reference position along line is initially zero
-        a = 0.0;
-
-        // Check stepsize first. We do not allow displacements
-        // larger than emstep.
-        //
-        do
-        {
-            // Pick a new position C by adding stepsize to A.
-            c = a + stepsize;
-
-            // Calculate what the largest change in any individual coordinate
-            // would be (translation along line * gradient along line)
-            maxdelta = 0;
-            for (i = 0; i < n; i++)
-            {
-                delta = c * s[i];
-                if (delta > maxdelta)
-                {
-                    maxdelta = delta;
-                }
-            }
-            // If any displacement is larger than the stepsize limit, reduce the step
-            if (maxdelta > inputrec->em_stepsize)
-            {
-                stepsize *= 0.1;
-            }
-        } while (maxdelta > inputrec->em_stepsize);
-
-        // Take a trial step and move the coordinate array xc[] to position C
-        real* xc = static_cast<real*>(sc->s.x.rvec_array()[0]);
-        for (i = 0; i < n; i++)
-        {
-            xc[i] = lastx[i] + c * s[i];
-        }
-
-        neval++;
-        // Calculate energy for the trial step in position C
-        energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE);
-
-        // Calc line gradient in position C
-        real* fc = static_cast<real*>(sc->f.view().force()[0]);
-        for (gpc = 0, i = 0; i < n; i++)
-        {
-            gpc -= s[i] * fc[i]; /* f is negative gradient, thus the sign */
-        }
-        /* Sum the gradient along the line across CPUs */
-        if (PAR(cr))
-        {
-            gmx_sumd(1, &gpc, cr);
-        }
-
-        // This is the max amount of increase in energy we tolerate.
-        // By allowing VERY small changes (close to numerical precision) we
-        // frequently find even better (lower) final energies.
-        tmp = std::sqrt(GMX_REAL_EPS) * fabs(sa->epot);
-
-        // Accept the step if the energy is lower in the new position C (compared to A),
-        // or if it is not significantly higher and the line derivative is still negative.
-        foundlower = sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp));
-        // If true, great, we found a better energy. We no longer try to alter the
-        // stepsize, but simply accept this new better position. Then we select a new
-        // search direction instead, which will be much more efficient than continuing
-        // to take smaller steps along a line. Set fnorm based on the new C position,
-        // which will be used to update the stepsize to 1/fnorm further down.
-
-        // If false, the energy is NOT lower in point C, i.e. it will be the same
-        // or higher than in point A. In this case it is pointless to move to point C,
-        // so we will have to do more iterations along the same line to find a smaller
-        // value in the interval [A=0.0,C].
-        // Here, A is still 0.0, but that will change when we do a search in the interval
-        // [0.0,C] below. That search we will do by interpolation or bisection rather
-        // than with the stepsize, so no need to modify it. For the next search direction
-        // it will be reset to 1/fnorm anyway.
-
-        if (!foundlower)
-        {
-            // OK, if we didn't find a lower value we will have to locate one now - there must
-            // be one in the interval [a,c].
-            // The same thing is valid here, though: Don't spend dozens of iterations to find
-            // the line minimum. We try to interpolate based on the derivative at the endpoints,
-            // and only continue until we find a lower value. In most cases this means 1-2 iterations.
-            // I also have a safeguard for potentially really pathological functions so we never
-            // take more than 20 steps before we give up.
-            // If we already found a lower value we just skip this step and continue to the update.
-            real fnorm = 0;
-            nminstep   = 0;
-            do
-            {
-                // Select a new trial point B in the interval [A,C].
-                // If the derivatives at points a & c have different sign we interpolate to zero,
-                // otherwise just do a bisection since there might be multiple minima/maxima
-                // inside the interval.
-                if (gpa < 0 && gpc > 0)
-                {
-                    b = a + gpa * (a - c) / (gpc - gpa);
-                }
-                else
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                /* safeguard if interpolation close to machine accuracy causes errors:
-                 * never go outside the interval
-                 */
-                if (b <= a || b >= c)
-                {
-                    b = 0.5 * (a + c);
-                }
-
-                // Take a trial step to point B
-                real* xb = static_cast<real*>(sb->s.x.rvec_array()[0]);
-                for (i = 0; i < n; i++)
-                {
-                    xb[i] = lastx[i] + b * s[i];
-                }
-
-                neval++;
-                // Calculate energy for the trial step in point B
-                energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE);
-                fnorm = sb->fnorm;
-
-                // Calculate gradient in point B
-                real* fb = static_cast<real*>(sb->f.view().force()[0]);
-                for (gpb = 0, i = 0; i < n; i++)
-                {
-                    gpb -= s[i] * fb[i]; /* f is negative gradient, thus the sign */
-                }
-                /* Sum the gradient along the line across CPUs */
-                if (PAR(cr))
-                {
-                    gmx_sumd(1, &gpb, cr);
-                }
-
-                // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative
-                // at the new point B, and rename the endpoints of this new interval A and C.
-                if (gpb > 0)
-                {
-                    /* Replace c endpoint with b */
-                    c = b;
-                    /* copy state b to c */
-                    *sc = *sb;
-                }
-                else
-                {
-                    /* Replace a endpoint with b */
-                    a = b;
-                    /* copy state b to a */
-                    *sa = *sb;
-                }
-
-                /*
-                 * Stop search as soon as we find a value smaller than the endpoints,
-                 * or if the tolerance is below machine precision.
-                 * Never run more than 20 steps, no matter what.
-                 */
-                nminstep++;
-            } while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20));
-
-            if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20)
-            {
-                /* OK. We couldn't find a significantly lower energy.
-                 * If ncorr==0 this was steepest descent, and then we give up.
-                 * If not, reset memory to restart as steepest descent before quitting.
-                 */
-                if (ncorr == 0)
-                {
-                    /* Converged */
-                    converged = TRUE;
-                    break;
-                }
-                else
-                {
-                    /* Reset memory */
-                    ncorr = 0;
-                    /* Search in gradient direction */
-                    for (i = 0; i < n; i++)
-                    {
-                        dx[point][i] = ff[i];
-                    }
-                    /* Reset stepsize */
-                    stepsize = 1.0 / fnorm;
-                    continue;
-                }
-            }
-
-            /* Select min energy state of A & C, put the best in xx/ff/Epot
-             */
-            if (sc->epot < sa->epot)
-            {
-                /* Use state C */
-                ems        = *sc;
-                step_taken = c;
-            }
-            else
-            {
-                /* Use state A */
-                ems        = *sa;
-                step_taken = a;
-            }
-        }
-        else
-        {
-            /* found lower */
-            /* Use state C */
-            ems        = *sc;
-            step_taken = c;
-        }
-
-        /* Update the memory information, and calculate a new
-         * approximation of the inverse hessian
-         */
-
-        /* Have new data in Epot, xx, ff */
-        if (ncorr < nmaxcorr)
-        {
-            ncorr++;
-        }
-
-        for (i = 0; i < n; i++)
-        {
-            dg[point][i] = lastf[i] - ff[i];
-            dx[point][i] *= step_taken;
-        }
-
-        dgdg = 0;
-        dgdx = 0;
-        for (i = 0; i < n; i++)
-        {
-            dgdg += dg[point][i] * dg[point][i];
-            dgdx += dg[point][i] * dx[point][i];
-        }
-
-        diag = dgdx / dgdg;
-
-        rho[point] = 1.0 / dgdx;
-        point++;
-
-        if (point >= nmaxcorr)
-        {
-            point = 0;
-        }
-
-        /* Update */
-        for (i = 0; i < n; i++)
-        {
-            p[i] = ff[i];
-        }
-
-        cp = point;
-
-        /* Recursive update. First go back over the memory points */
-        for (k = 0; k < ncorr; k++)
-        {
-            cp--;
-            if (cp < 0)
-            {
-                cp = ncorr - 1;
-            }
-
-            sq = 0;
-            for (i = 0; i < n; i++)
-            {
-                sq += dx[cp][i] * p[i];
-            }
-
-            alpha[cp] = rho[cp] * sq;
-
-            for (i = 0; i < n; i++)
-            {
-                p[i] -= alpha[cp] * dg[cp][i];
-            }
-        }
-
-        for (i = 0; i < n; i++)
-        {
-            p[i] *= diag;
-        }
-
-        /* And then go forward again */
-        for (k = 0; k < ncorr; k++)
-        {
-            yr = 0;
-            for (i = 0; i < n; i++)
-            {
-                yr += p[i] * dg[cp][i];
-            }
-
-            beta = rho[cp] * yr;
-            beta = alpha[cp] - beta;
-
-            for (i = 0; i < n; i++)
-            {
-                p[i] += beta * dx[cp][i];
-            }
-
-            cp++;
-            if (cp >= ncorr)
-            {
-                cp = 0;
-            }
-        }
-
-        for (i = 0; i < n; i++)
-        {
-            if (!frozen[i])
-            {
-                dx[point][i] = p[i];
-            }
-            else
-            {
-                dx[point][i] = 0;
-            }
-        }
-
-        /* Print it if necessary */
-        if (MASTER(cr))
-        {
-            if (mdrunOptions.verbose)
-            {
-                double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", step,
-                        ems.epot, ems.fnorm / sqrtNumAtoms, ems.fmax, ems.a_fmax + 1);
-                fflush(stderr);
-            }
-            /* Store the new (lower) energies */
-            matrix nullBox = {};
-            energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(step), mdatoms->tmass,
-                                             enerd, nullptr, nullptr, nullBox, PTCouplingArrays(), 0,
-                                             nullptr, nullptr, vir, pres, nullptr, mu_tot, constr);
-
-            do_log = do_per_step(step, inputrec->nstlog);
-            do_ene = do_per_step(step, inputrec->nstenergy);
-
-            imdSession->fillEnergyRecord(step, TRUE);
-
-            if (do_log)
-            {
-                EnergyOutput::printHeader(fplog, step, step);
-            }
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE,
-                                               do_log ? fplog : nullptr, step, step,
-                                               fr->fcdata.get(), nullptr);
-        }
-
-        /* Send x and E to IMD client, if bIMD is TRUE. */
-        if (imdSession->run(step, TRUE, state_global->box, state_global->x.rvec_array(), 0) && MASTER(cr))
-        {
-            imdSession->sendPositionsAndEnergies();
-        }
-
-        // Reset stepsize in we are doing more iterations
-        stepsize = 1.0;
-
-        /* Stop when the maximum force lies below tolerance.
-         * If we have reached machine precision, converged is already set to true.
-         */
-        converged = converged || (ems.fmax < inputrec->em_tol);
-
-    } /* End of the loop */
-
-    if (converged)
-    {
-        step--; /* we never took that last step in this case */
-    }
-    if (ems.fmax > inputrec->em_tol)
-    {
-        if (MASTER(cr))
-        {
-            warn_step(fplog, inputrec->em_tol, ems.fmax, step - 1 == number_steps, FALSE);
-        }
-        converged = FALSE;
-    }
-
-    /* If we printed energy and/or logfile last step (which was the last step)
-     * we don't have to do it again, but otherwise print the final values.
-     */
-    if (!do_log) /* Write final value to log since we didn't do anythin last step */
-    {
-        EnergyOutput::printHeader(fplog, step, step);
-    }
-    if (!do_ene || !do_log) /* Write final energy file entries */
-    {
-        energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE,
-                                           !do_log ? fplog : nullptr, step, step, fr->fcdata.get(),
-                                           nullptr);
-    }
-
-    /* Print some stuff... */
-    if (MASTER(cr))
-    {
-        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
-    }
-
-    /* IMPORTANT!
-     * For accurate normal mode calculation it is imperative that we
-     * store the last conformation into the full precision binary trajectory.
-     *
-     * However, we should only do it if we did NOT already write this step
-     * above (which we did if do_x or do_f was true).
-     */
-    do_x = !do_per_step(step, inputrec->nstxout);
-    do_f = !do_per_step(step, inputrec->nstfout);
-    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), top_global, inputrec,
-                  step, &ems, state_global, observablesHistory);
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, number_steps, &ems, sqrtNumAtoms);
-        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, number_steps, &ems, sqrtNumAtoms);
-
-        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    /* To print the actual number of steps we needed somewhere */
-    walltime_accounting_set_nsteps_done(walltime_accounting, step);
-}
-
-void LegacySimulator::do_steep()
-{
-    const char*       SD = "Steepest Descents";
-    gmx_localtop_t    top(top_global->ffparams);
-    gmx_global_stat_t gstat;
-    real              stepsize;
-    real              ustep;
-    gmx_bool          bDone, bAbort, do_x, do_f;
-    tensor            vir, pres;
-    rvec              mu_tot = { 0 };
-    int               nsteps;
-    int               count          = 0;
-    int               steps_accepted = 0;
-    auto              mdatoms        = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating steepest-descent energy minimization via the "
-                    "integrator .mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx minimize and an .mdp option.");
-
-    /* Create 2 states on the stack and extract pointers that we will swap */
-    em_state_t  s0{}, s1{};
-    em_state_t* s_min = &s0;
-    em_state_t* s_try = &s1;
-
-    /* Init em and store the local state in s_try */
-    init_em(fplog, mdlog, SD, cr, inputrec, imdSession, pull_work, state_global, top_global, s_try,
-            &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, nullptr);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work,
-                                   nullptr, false, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    /* Print to log file  */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, SD);
-
-    /* Set variables for stepsize (in nm). This is the largest
-     * step that we are going to make in any direction.
-     */
-    ustep    = inputrec->em_stepsize;
-    stepsize = 0;
-
-    /* Max number of steps  */
-    nsteps = inputrec->nsteps;
-
-    if (MASTER(cr))
-    {
-        /* Print to the screen  */
-        sp_header(stderr, SD, inputrec->em_tol, nsteps);
-    }
-    if (fplog)
-    {
-        sp_header(fplog, SD, inputrec->em_tol, nsteps);
-    }
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-
-    /**** HERE STARTS THE LOOP ****
-     * count is the counter for the number of steps
-     * bDone will be TRUE when the minimization has converged
-     * bAbort will be TRUE when nsteps steps have been performed or when
-     * the stepsize becomes smaller than is reasonable for machine precision
-     */
-    count  = 0;
-    bDone  = FALSE;
-    bAbort = FALSE;
-    while (!bDone && !bAbort)
-    {
-        bAbort = (nsteps >= 0) && (count == nsteps);
-
-        /* set new coordinates, except for first step */
-        bool validStep = true;
-        if (count > 0)
-        {
-            validStep = do_em_step(cr, inputrec, mdatoms, s_min, stepsize,
-                                   s_min->f.view().forceWithPadding(), s_try, constr, count);
-        }
-
-        if (validStep)
-        {
-            energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0);
-        }
-        else
-        {
-            // Signal constraint error during stepping with energy=inf
-            s_try->epot = std::numeric_limits<real>::infinity();
-        }
-
-        if (MASTER(cr))
-        {
-            EnergyOutput::printHeader(fplog, count, count);
-        }
-
-        if (count == 0)
-        {
-            s_min->epot = s_try->epot;
-        }
-
-        /* Print it if necessary  */
-        if (MASTER(cr))
-        {
-            if (mdrunOptions.verbose)
-            {
-                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
-                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax + 1,
-                        ((count == 0) || (s_try->epot < s_min->epot)) ? '\n' : '\r');
-                fflush(stderr);
-            }
-
-            if ((count == 0) || (s_try->epot < s_min->epot))
-            {
-                /* Store the new (lower) energies  */
-                matrix nullBox = {};
-                energyOutput.addDataAtEnergyStep(false, false, static_cast<double>(count),
-                                                 mdatoms->tmass, enerd, nullptr, nullptr, nullBox,
-                                                 PTCouplingArrays(), 0, nullptr, nullptr, vir, pres,
-                                                 nullptr, mu_tot, constr);
-
-                imdSession->fillEnergyRecord(count, TRUE);
-
-                const bool do_dr = do_per_step(steps_accepted, inputrec->nstdisreout);
-                const bool do_or = do_per_step(steps_accepted, inputrec->nstorireout);
-                energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), TRUE, do_dr, do_or,
-                                                   fplog, count, count, fr->fcdata.get(), nullptr);
-                fflush(fplog);
-            }
-        }
-
-        /* Now if the new energy is smaller than the previous...
-         * or if this is the first step!
-         * or if we did random steps!
-         */
-
-        if ((count == 0) || (s_try->epot < s_min->epot))
-        {
-            steps_accepted++;
-
-            /* Test whether the convergence criterion is met...  */
-            bDone = (s_try->fmax < inputrec->em_tol);
-
-            /* Copy the arrays for force, positions and energy  */
-            /* The 'Min' array always holds the coords and forces of the minimal
-               sampled energy  */
-            swap_em_state(&s_min, &s_try);
-            if (count > 0)
-            {
-                ustep *= 1.2;
-            }
-
-            /* Write to trn, if necessary */
-            do_x = do_per_step(steps_accepted, inputrec->nstxout);
-            do_f = do_per_step(steps_accepted, inputrec->nstfout);
-            write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, top_global, inputrec, count, s_min,
-                          state_global, observablesHistory);
-        }
-        else
-        {
-            /* If energy is not smaller make the step smaller...  */
-            ustep *= 0.5;
-
-            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
-            {
-                /* Reload the old state */
-                em_dd_partition_system(fplog, mdlog, count, cr, top_global, inputrec, imdSession,
-                                       pull_work, s_min, &top, mdAtoms, fr, vsite, constr, nrnb, wcycle);
-            }
-        }
-
-        // If the force is very small after finishing minimization,
-        // we risk dividing by zero when calculating the step size.
-        // So we check first if the minimization has stopped before
-        // trying to obtain a new step size.
-        if (!bDone)
-        {
-            /* Determine new step  */
-            stepsize = ustep / s_min->fmax;
-        }
-
-        /* Check if stepsize is too small, with 1 nm as a characteristic length */
-#if GMX_DOUBLE
-        if (count == nsteps || ustep < 1e-12)
-#else
-        if (count == nsteps || ustep < 1e-6)
-#endif
-        {
-            if (MASTER(cr))
-            {
-                warn_step(fplog, inputrec->em_tol, s_min->fmax, count == nsteps, constr != nullptr);
-            }
-            bAbort = TRUE;
-        }
-
-        /* Send IMD energies and positions, if bIMD is TRUE. */
-        if (imdSession->run(count, TRUE, MASTER(cr) ? state_global->box : nullptr,
-                            MASTER(cr) ? state_global->x.rvec_array() : nullptr, 0)
-            && MASTER(cr))
-        {
-            imdSession->sendPositionsAndEnergies();
-        }
-
-        count++;
-    } /* End of the loop  */
-
-    /* Print some data...  */
-    if (MASTER(cr))
-    {
-        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
-    }
-    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout != 0, ftp2fn(efSTO, nfile, fnm),
-                  top_global, inputrec, count, s_min, state_global, observablesHistory);
-
-    if (MASTER(cr))
-    {
-        double sqrtNumAtoms = sqrt(static_cast<double>(state_global->natoms));
-
-        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, s_min, sqrtNumAtoms);
-        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, s_min, sqrtNumAtoms);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    /* To print the actual number of steps we needed somewhere */
-    inputrec->nsteps = count;
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, count);
-}
-
-void LegacySimulator::do_nm()
-{
-    const char*         NM = "Normal Mode Analysis";
-    int                 nnodes;
-    gmx_localtop_t      top(top_global->ffparams);
-    gmx_global_stat_t   gstat;
-    tensor              vir, pres;
-    rvec                mu_tot = { 0 };
-    rvec*               dfdx;
-    gmx_bool            bSparse; /* use sparse matrix storage format */
-    size_t              sz;
-    gmx_sparsematrix_t* sparse_matrix = nullptr;
-    real*               full_matrix   = nullptr;
-
-    /* added with respect to mdrun */
-    int  row, col;
-    real der_range = 10.0 * std::sqrt(GMX_REAL_EPS);
-    real x_min;
-    bool bIsMaster = MASTER(cr);
-    auto mdatoms   = mdAtoms->mdatoms();
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that activating normal-mode analysis via the integrator "
-                    ".mdp option and the command gmx mdrun may "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx normal-modes.");
-
-    if (constr != nullptr)
-    {
-        gmx_fatal(
-                FARGS,
-                "Constraints present with Normal Mode Analysis, this combination is not supported");
-    }
-
-    gmx_shellfc_t* shellfc;
-
-    em_state_t state_work{};
-
-    /* Init em and store the local state in state_minimum */
-    init_em(fplog, mdlog, NM, cr, inputrec, imdSession, pull_work, state_global, top_global,
-            &state_work, &top, nrnb, fr, mdAtoms, &gstat, vsite, constr, &shellfc);
-    const bool  simulationsShareState = false;
-    gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-
-    std::vector<int>       atom_index = get_atom_index(top_global);
-    std::vector<gmx::RVec> fneg(atom_index.size(), { 0, 0, 0 });
-    snew(dfdx, atom_index.size());
-
-#if !GMX_DOUBLE
-    if (bIsMaster)
-    {
-        fprintf(stderr,
-                "NOTE: This version of GROMACS has been compiled in single precision,\n"
-                "      which MIGHT not be accurate enough for normal mode analysis.\n"
-                "      GROMACS now uses sparse matrix storage, so the memory requirements\n"
-                "      are fairly modest even if you recompile in double precision.\n\n");
-    }
-#endif
-
-    /* Check if we can/should use sparse storage format.
-     *
-     * Sparse format is only useful when the Hessian itself is sparse, which it
-     * will be when we use a cutoff.
-     * For small systems (n<1000) it is easier to always use full matrix format, though.
-     */
-    if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0)
-    {
-        GMX_LOG(mdlog.warning)
-                .appendText("Non-cutoff electrostatics used, forcing full Hessian format.");
-        bSparse = FALSE;
-    }
-    else if (atom_index.size() < 1000)
-    {
-        GMX_LOG(mdlog.warning)
-                .appendTextFormatted("Small system size (N=%zu), using full Hessian format.",
-                                     atom_index.size());
-        bSparse = FALSE;
-    }
-    else
-    {
-        GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format.");
-        bSparse = TRUE;
-    }
-
-    /* Number of dimensions, based on real atoms, that is not vsites or shell */
-    sz = DIM * atom_index.size();
-
-    fprintf(stderr, "Allocating Hessian memory...\n\n");
-
-    if (bSparse)
-    {
-        sparse_matrix                       = gmx_sparsematrix_init(sz);
-        sparse_matrix->compressed_symmetric = TRUE;
-    }
-    else
-    {
-        snew(full_matrix, sz * sz);
-    }
-
-    /* Write start time and temperature */
-    print_em_start(fplog, cr, walltime_accounting, wcycle, NM);
-
-    /* fudge nr of steps to nr of atoms */
-    inputrec->nsteps = atom_index.size() * 2;
-
-    if (bIsMaster)
-    {
-        fprintf(stderr, "starting normal mode calculation '%s'\n%" PRId64 " steps.\n\n",
-                *(top_global->name), inputrec->nsteps);
-    }
-
-    nnodes = cr->nnodes;
-
-    /* Make evaluate_energy do a single node force calculation */
-    cr->nnodes = 1;
-    EnergyEvaluator energyEvaluator{ fplog,    mdlog,      cr,        ms,   top_global,      &top,
-                                     inputrec, imdSession, pull_work, nrnb, wcycle,          gstat,
-                                     vsite,    constr,     mdAtoms,   fr,   runScheduleWork, enerd };
-    energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE);
-    cr->nnodes = nnodes;
-
-    /* if forces are not small, warn user */
-    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work);
-
-    GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax);
-    if (state_work.fmax > 1.0e-3)
-    {
-        GMX_LOG(mdlog.warning)
-                .appendText(
-                        "The force is probably not small enough to "
-                        "ensure that you are at a minimum.\n"
-                        "Be aware that negative eigenvalues may occur\n"
-                        "when the resulting matrix is diagonalized.");
-    }
-
-    /***********************************************************
-     *
-     *      Loop over all pairs in matrix
-     *
-     *      do_force called twice. Once with positive and
-     *      once with negative displacement
-     *
-     ************************************************************/
-
-    /* Steps are divided one by one over the nodes */
-    bool bNS          = true;
-    auto state_work_x = makeArrayRef(state_work.s.x);
-    auto state_work_f = state_work.f.view().force();
-    for (index aid = cr->nodeid; aid < ssize(atom_index); aid += nnodes)
-    {
-        size_t atom = atom_index[aid];
-        for (size_t d = 0; d < DIM; d++)
-        {
-            int64_t step        = 0;
-            int     force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES;
-            double  t           = 0;
-
-            x_min = state_work_x[atom][d];
-
-            for (unsigned int dx = 0; (dx < 2); dx++)
-            {
-                if (dx == 0)
-                {
-                    state_work_x[atom][d] = x_min - der_range;
-                }
-                else
-                {
-                    state_work_x[atom][d] = x_min + der_range;
-                }
-
-                /* Make evaluate_energy do a single node force calculation */
-                cr->nnodes = 1;
-                if (shellfc)
-                {
-                    /* Now is the time to relax the shells */
-                    relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, nullptr, step, inputrec,
-                                        imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
-                                        state_work.s.natoms, state_work.s.x.arrayRefWithPadding(),
-                                        state_work.s.v.arrayRefWithPadding(), state_work.s.box,
-                                        state_work.s.lambda, &state_work.s.hist, &state_work.f.view(),
-                                        vir, mdatoms, nrnb, wcycle, shellfc, fr, runScheduleWork, t,
-                                        mu_tot, vsite, DDBalanceRegionHandler(nullptr));
-                    bNS = false;
-                    step++;
-                }
-                else
-                {
-                    energyEvaluator.run(&state_work, mu_tot, vir, pres, aid * 2 + dx, FALSE);
-                }
-
-                cr->nnodes = nnodes;
-
-                if (dx == 0)
-                {
-                    std::copy(state_work_f.begin(), state_work_f.begin() + atom_index.size(),
-                              fneg.begin());
-                }
-            }
-
-            /* x is restored to original */
-            state_work_x[atom][d] = x_min;
-
-            for (size_t j = 0; j < atom_index.size(); j++)
-            {
-                for (size_t k = 0; (k < DIM); k++)
-                {
-                    dfdx[j][k] = -(state_work_f[atom_index[j]][k] - fneg[j][k]) / (2 * der_range);
-                }
-            }
-
-            if (!bIsMaster)
-            {
-#if GMX_MPI
-#    define mpi_type GMX_MPI_REAL
-                MPI_Send(dfdx[0], atom_index.size() * DIM, mpi_type, MASTER(cr), cr->nodeid,
-                         cr->mpi_comm_mygroup);
-#endif
-            }
-            else
-            {
-                for (index node = 0; (node < nnodes && aid + node < ssize(atom_index)); node++)
-                {
-                    if (node > 0)
-                    {
-#if GMX_MPI
-                        MPI_Status stat;
-                        MPI_Recv(dfdx[0], atom_index.size() * DIM, mpi_type, node, node,
-                                 cr->mpi_comm_mygroup, &stat);
-#    undef mpi_type
-#endif
-                    }
-
-                    row = (aid + node) * DIM + d;
-
-                    for (size_t j = 0; j < atom_index.size(); j++)
-                    {
-                        for (size_t k = 0; k < DIM; k++)
-                        {
-                            col = j * DIM + k;
-
-                            if (bSparse)
-                            {
-                                if (col >= row && dfdx[j][k] != 0.0)
-                                {
-                                    gmx_sparsematrix_increment_value(sparse_matrix, row, col, dfdx[j][k]);
-                                }
-                            }
-                            else
-                            {
-                                full_matrix[row * sz + col] = dfdx[j][k];
-                            }
-                        }
-                    }
-                }
-            }
-
-            if (mdrunOptions.verbose && fplog)
-            {
-                fflush(fplog);
-            }
-        }
-        /* write progress */
-        if (bIsMaster && mdrunOptions.verbose)
-        {
-            fprintf(stderr, "\rFinished step %d out of %td",
-                    std::min<int>(atom + nnodes, atom_index.size()), ssize(atom_index));
-            fflush(stderr);
-        }
-    }
-
-    if (bIsMaster)
-    {
-        fprintf(stderr, "\n\nWriting Hessian...\n");
-        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
-    }
-
-    finish_em(cr, outf, walltime_accounting, wcycle);
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size() * 2);
-}
-
-} // namespace gmx
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.cpp
deleted file mode 100644
index a75917ae09..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.cpp
+++ /dev/null
@@ -1,1493 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/*! \internal \file
- *
- * \brief Implements the replica exchange routines.
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "replicaexchange.h"
-
-#include "config.h"
-
-#include <cmath>
-
-#include <random>
-
-#include "gromacs/domdec/collect.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/random/threefry.h"
-#include "gromacs/random/uniformintdistribution.h"
-#include "gromacs/random/uniformrealdistribution.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/pleasecite.h"
-#include "gromacs/utility/smalloc.h"
-
-
-/* PLUMED */
-#include "../../../Plumed.h"
-extern int    plumedswitch;
-extern plumed plumedmain;
-/* END PLUMED */
-
-/* PLUMED HREX */
-extern int plumed_hrex;
-/* END PLUMED HREX */
-
-//! Helps cut off probability values.
-constexpr int c_probabilityCutoff = 100;
-
-/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
-
-//! Rank in the multisimulation
-#define MSRANK(ms, nodeid) (nodeid)
-
-//! Enum for replica exchange flavours
-enum
-{
-    ereTEMP,
-    ereLAMBDA,
-    ereENDSINGLE,
-    ereTL,
-    ereNR
-};
-/*! \brief Strings describing replica exchange flavours.
- *
- *  end_single_marker merely notes the end of single variable replica
- *  exchange. All types higher than it are multiple replica exchange
- *  methods.
- *
- * Eventually, should add 'pressure', 'temperature and pressure',
- *  'lambda_and_pressure', 'temperature_lambda_pressure'?; Let's wait
- *  until we feel better about the pressure control methods giving
- *  exact ensembles.  Right now, we assume constant pressure */
-static const char* erename[ereNR] = { "temperature", "lambda", "end_single_marker",
-                                      "temperature and lambda" };
-
-//! Working data for replica exchange.
-struct gmx_repl_ex
-{
-    //! Replica ID
-    int repl;
-    //! Total number of replica
-    int nrepl;
-    //! Temperature
-    real temp;
-    //! Replica exchange type from ere enum
-    int type;
-    //! Quantity, e.g. temperature or lambda; first index is ere, second index is replica ID
-    real** q;
-    //! Use constant pressure and temperature
-    gmx_bool bNPT;
-    //! Replica pressures
-    real* pres;
-    //! Replica indices
-    int* ind;
-    //! Used for keeping track of all the replica swaps
-    int* allswaps;
-    //! Replica exchange interval (number of steps)
-    int nst;
-    //! Number of exchanges per interval
-    int nex;
-    //! Random seed
-    int seed;
-    //! Number of even and odd replica change attempts
-    int nattempt[2];
-    //! Sum of probabilities
-    real* prob_sum;
-    //! Number of moves between replicas i and j
-    int** nmoves;
-    //! i-th element of the array is the number of exchanges between replica i-1 and i
-    int* nexchange;
-
-    /*! \brief Helper arrays for replica exchange; allocated here
-     * so they don't have to be allocated each time */
-    //! \{
-    int*      destinations;
-    int**     cyclic;
-    int**     order;
-    int*      tmpswap;
-    gmx_bool* incycle;
-    gmx_bool* bEx;
-    //! \}
-
-    //! Helper arrays to hold the quantities that are exchanged.
-    //! \{
-    real*  prob;
-    real*  Epot;
-    real*  beta;
-    real*  Vol;
-    real** de;
-    //! \}
-};
-
-// TODO We should add Doxygen here some time.
-//! \cond
-
-static gmx_bool repl_quantity(const gmx_multisim_t* ms, struct gmx_repl_ex* re, int ere, real q)
-{
-    real*    qall;
-    gmx_bool bDiff;
-    int      s;
-
-    snew(qall, ms->numSimulations_);
-    qall[re->repl] = q;
-    gmx_sum_sim(ms->numSimulations_, qall, ms);
-
-    /* PLUMED */
-    //bDiff = FALSE;
-    //for (s = 1; s < ms->numSimulations_; s++)
-    //{
-    //    if (qall[s] != qall[0])
-    //    {
-            bDiff = TRUE;
-    //    }
-    //}
-    /* PLUMED */
-
-    if (bDiff)
-    {
-        /* Set the replica exchange type and quantities */
-        re->type = ere;
-
-        snew(re->q[ere], re->nrepl);
-        for (s = 0; s < ms->numSimulations_; s++)
-        {
-            re->q[ere][s] = qall[s];
-        }
-    }
-    sfree(qall);
-    return bDiff;
-}
-
-gmx_repl_ex_t init_replica_exchange(FILE*                            fplog,
-                                    const gmx_multisim_t*            ms,
-                                    int                              numAtomsInSystem,
-                                    const t_inputrec*                ir,
-                                    const ReplicaExchangeParameters& replExParams)
-{
-    real                pres;
-    int                 i, j;
-    struct gmx_repl_ex* re;
-    gmx_bool            bTemp;
-    gmx_bool            bLambda = FALSE;
-
-    fprintf(fplog, "\nInitializing Replica Exchange\n");
-
-    if (!isMultiSim(ms) || ms->numSimulations_ == 1)
-    {
-        gmx_fatal(FARGS,
-                  "Nothing to exchange with only one replica, maybe you forgot to set the "
-                  "-multidir option of mdrun?");
-    }
-    if (replExParams.numExchanges < 0)
-    {
-        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
-    }
-
-    if (!EI_DYNAMICS(ir->eI))
-    {
-        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
-        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
-         * distinct from isMultiSim(ms). A multi-simulation only runs
-         * with real MPI parallelism, but this does not imply PAR(cr)
-         * is true!
-         *
-         * Since we are using a dynamical integrator, the only
-         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
-         * synonymous. The only way for cr->nnodes > 1 to be true is
-         * if we are using DD. */
-    }
-
-    snew(re, 1);
-
-    re->repl  = ms->simulationIndex_;
-    re->nrepl = ms->numSimulations_;
-    snew(re->q, ereENDSINGLE);
-
-    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
-
-    /* We only check that the number of atoms in the systms match.
-     * This, of course, do not guarantee that the systems are the same,
-     * but it does guarantee that we can perform replica exchange.
-     */
-    check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE);
-    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
-    check_multi_int64(fplog, ms, ir->init_step + ir->nsteps, "init_step+nsteps", FALSE);
-    const int nst = replExParams.exchangeInterval;
-    check_multi_int64(fplog, ms, (ir->init_step + nst - 1) / nst,
-                      "first exchange step: init_step/-replex", FALSE);
-    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
-    check_multi_int(fplog, ms, ir->opts.ngtc, "the number of temperature coupling groups", FALSE);
-    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
-    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
-    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
-
-    re->temp = ir->opts.ref_t[0];
-    for (i = 1; (i < ir->opts.ngtc); i++)
-    {
-        if (ir->opts.ref_t[i] != re->temp)
-        {
-            fprintf(fplog,
-                    "\nWARNING: The temperatures of the different temperature coupling groups are "
-                    "not identical\n\n");
-            fprintf(stderr,
-                    "\nWARNING: The temperatures of the different temperature coupling groups are "
-                    "not identical\n\n");
-        }
-    }
-
-    re->type = -1;
-    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
-    if (ir->efep != efepNO)
-    {
-        bLambda = repl_quantity(ms, re, ereLAMBDA, static_cast<real>(ir->fepvals->init_fep_state));
-    }
-    if (re->type == -1) /* nothing was assigned */
-    {
-        gmx_fatal(FARGS,
-                  "The properties of the %d systems are all the same, there is nothing to exchange",
-                  re->nrepl);
-    }
-    if (bLambda && bTemp)
-    {
-        re->type = ereTL;
-    }
-
-    if (bTemp)
-    {
-        please_cite(fplog, "Sugita1999a");
-        if (ir->epc != epcNO)
-        {
-            re->bNPT = TRUE;
-            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
-            please_cite(fplog, "Okabe2001a");
-        }
-        if (ir->etc == etcBERENDSEN)
-        {
-            gmx_fatal(FARGS,
-                      "REMD with the %s thermostat does not produce correct potential energy "
-                      "distributions, consider using the %s thermostat instead",
-                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
-        }
-    }
-    if (bLambda)
-    {
-        if (ir->fepvals->delta_lambda != 0) /* check this? */
-        {
-            gmx_fatal(FARGS, "delta_lambda is not zero");
-        }
-    }
-    if (re->bNPT)
-    {
-        snew(re->pres, re->nrepl);
-        if (ir->epct == epctSURFACETENSION)
-        {
-            pres = ir->ref_p[ZZ][ZZ];
-        }
-        else
-        {
-            pres = 0;
-            j    = 0;
-            for (i = 0; i < DIM; i++)
-            {
-                if (ir->compress[i][i] != 0)
-                {
-                    pres += ir->ref_p[i][i];
-                    j++;
-                }
-            }
-            pres /= j;
-        }
-        re->pres[re->repl] = pres;
-        gmx_sum_sim(re->nrepl, re->pres, ms);
-    }
-
-    /* Make an index for increasing replica order */
-    /* only makes sense if one or the other is varying, not both!
-       if both are varying, we trust the order the person gave. */
-    snew(re->ind, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        re->ind[i] = i;
-    }
-
-    /* PLUMED */
-    // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD)
-    // in those cases replicas can share the same temperature.
-    /*
-    if (re->type < ereENDSINGLE)
-    {
-
-        for (i = 0; i < re->nrepl; i++)
-        {
-            for (j = i + 1; j < re->nrepl; j++)
-            {
-                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
-                {*/
-                    /* Unordered replicas are supposed to work, but there
-                     * is still an issues somewhere.
-                     * Note that at this point still re->ind[i]=i.
-                     */
-                 /*
-                    gmx_fatal(FARGS,
-                              "Replicas with indices %d < %d have %ss %g > %g, please order your "
-                              "replicas on increasing %s",
-                              i, j, erename[re->type], re->q[re->type][i], re->q[re->type][j],
-                              erename[re->type]);
-                }
-                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
-                {
-                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
-                }
-            }
-        }
-    }
-    */
-    /* END PLUMED */
-
-    /* keep track of all the swaps, starting with the initial placement. */
-    snew(re->allswaps, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        re->allswaps[i] = re->ind[i];
-    }
-
-    switch (re->type)
-    {
-        case ereTEMP:
-            fprintf(fplog, "\nReplica exchange in temperature\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
-            }
-            fprintf(fplog, "\n");
-            break;
-        case ereLAMBDA:
-            fprintf(fplog, "\nReplica exchange in lambda\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %3d", static_cast<int>(re->q[re->type][re->ind[i]]));
-            }
-            fprintf(fplog, "\n");
-            break;
-        case ereTL:
-            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
-            }
-            fprintf(fplog, "\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %5d", static_cast<int>(re->q[ereLAMBDA][re->ind[i]]));
-            }
-            fprintf(fplog, "\n");
-            break;
-        default: gmx_incons("Unknown replica exchange quantity");
-    }
-    if (re->bNPT)
-    {
-        fprintf(fplog, "\nRepl  p");
-        for (i = 0; i < re->nrepl; i++)
-        {
-            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
-        }
-
-        for (i = 0; i < re->nrepl; i++)
-        {
-            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i - 1]]))
-            {
-                fprintf(fplog,
-                        "\nWARNING: The reference pressures decrease with increasing "
-                        "temperatures\n\n");
-                fprintf(stderr,
-                        "\nWARNING: The reference pressures decrease with increasing "
-                        "temperatures\n\n");
-            }
-        }
-    }
-    re->nst = nst;
-    if (replExParams.randomSeed == -1)
-    {
-        if (isMasterSim(ms))
-        {
-            re->seed = static_cast<int>(gmx::makeRandomSeed());
-        }
-        else
-        {
-            re->seed = 0;
-        }
-        gmx_sumi_sim(1, &(re->seed), ms);
-    }
-    else
-    {
-        re->seed = replExParams.randomSeed;
-    }
-    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
-    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
-
-    re->nattempt[0] = 0;
-    re->nattempt[1] = 0;
-
-    snew(re->prob_sum, re->nrepl);
-    snew(re->nexchange, re->nrepl);
-    snew(re->nmoves, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        snew(re->nmoves[i], re->nrepl);
-    }
-    fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n");
-
-    /* generate space for the helper functions so we don't have to snew each time */
-
-    snew(re->destinations, re->nrepl);
-    snew(re->incycle, re->nrepl);
-    snew(re->tmpswap, re->nrepl);
-    snew(re->cyclic, re->nrepl);
-    snew(re->order, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        snew(re->cyclic[i], re->nrepl + 1);
-        snew(re->order[i], re->nrepl);
-    }
-    /* allocate space for the functions storing the data for the replicas */
-    /* not all of these arrays needed in all cases, but they don't take
-       up much space, since the max size is nrepl**2 */
-    snew(re->prob, re->nrepl);
-    snew(re->bEx, re->nrepl);
-    snew(re->beta, re->nrepl);
-    snew(re->Vol, re->nrepl);
-    snew(re->Epot, re->nrepl);
-    snew(re->de, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        snew(re->de[i], re->nrepl);
-    }
-    re->nex = replExParams.numExchanges;
-    return re;
-}
-
-static void exchange_reals(const gmx_multisim_t gmx_unused* ms, int gmx_unused b, real* v, int n)
-{
-    real* buf;
-    int   i;
-
-    if (v)
-    {
-        snew(buf, n);
-#if GMX_MPI
-        /*
-           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
-           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
-           ms->mastersComm_,MPI_STATUS_IGNORE);
-         */
-        {
-            MPI_Request mpi_req;
-
-            MPI_Isend(v, n * sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, &mpi_req);
-            MPI_Recv(buf, n * sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, MPI_STATUS_IGNORE);
-            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
-        }
-#endif
-        for (i = 0; i < n; i++)
-        {
-            v[i] = buf[i];
-        }
-        sfree(buf);
-    }
-}
-
-
-static void exchange_doubles(const gmx_multisim_t gmx_unused* ms, int gmx_unused b, double* v, int n)
-{
-    double* buf;
-    int     i;
-
-    if (v)
-    {
-        snew(buf, n);
-#if GMX_MPI
-        /*
-           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
-           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
-           ms->mastersComm_,MPI_STATUS_IGNORE);
-         */
-        {
-            MPI_Request mpi_req;
-
-            MPI_Isend(v, n * sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, &mpi_req);
-            MPI_Recv(buf, n * sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_,
-                     MPI_STATUS_IGNORE);
-            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
-        }
-#endif
-        for (i = 0; i < n; i++)
-        {
-            v[i] = buf[i];
-        }
-        sfree(buf);
-    }
-}
-
-static void exchange_rvecs(const gmx_multisim_t gmx_unused* ms, int gmx_unused b, rvec* v, int n)
-{
-    rvec* buf;
-    int   i;
-
-    if (v)
-    {
-        snew(buf, n);
-#if GMX_MPI
-        /*
-           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
-           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
-           ms->mastersComm_,MPI_STATUS_IGNORE);
-         */
-        {
-            MPI_Request mpi_req;
-
-            MPI_Isend(v[0], n * sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, &mpi_req);
-            MPI_Recv(buf[0], n * sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_,
-                     MPI_STATUS_IGNORE);
-            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
-        }
-#endif
-        for (i = 0; i < n; i++)
-        {
-            copy_rvec(buf[i], v[i]);
-        }
-        sfree(buf);
-    }
-}
-
-/* PLUMED HREX */
-void exchange_state(const gmx_multisim_t* ms, int b, t_state* state)
-/* END PLUMED HREX */
-{
-    /* When t_state changes, this code should be updated. */
-    int ngtc, nnhpres;
-    ngtc    = state->ngtc * state->nhchainlength;
-    nnhpres = state->nnhpres * state->nhchainlength;
-    exchange_rvecs(ms, b, state->box, DIM);
-    exchange_rvecs(ms, b, state->box_rel, DIM);
-    exchange_rvecs(ms, b, state->boxv, DIM);
-    exchange_reals(ms, b, &(state->veta), 1);
-    exchange_reals(ms, b, &(state->vol0), 1);
-    exchange_rvecs(ms, b, state->svir_prev, DIM);
-    exchange_rvecs(ms, b, state->fvir_prev, DIM);
-    exchange_rvecs(ms, b, state->pres_prev, DIM);
-    exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc);
-    exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc);
-    exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres);
-    exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres);
-    exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc);
-    exchange_doubles(ms, b, &state->baros_integral, 1);
-    exchange_rvecs(ms, b, state->x.rvec_array(), state->natoms);
-    exchange_rvecs(ms, b, state->v.rvec_array(), state->natoms);
-}
-
-/* PLUMED HREX */
-void copy_state_serial(const t_state* src, t_state* dest)
-/* END PLUMED HREX */
-{
-    if (dest != src)
-    {
-        /* Currently the local state is always a pointer to the global
-         * in serial, so we should never end up here.
-         * TODO: Implement a (trivial) t_state copy once converted to C++.
-         */
-        GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange");
-    }
-}
-
-static void scale_velocities(gmx::ArrayRef<gmx::RVec> velocities, real fac)
-{
-    for (auto& v : velocities)
-    {
-        v *= fac;
-    }
-}
-
-static void print_transition_matrix(FILE* fplog, int n, int** nmoves, const int* nattempt)
-{
-    int   i, j, ntot;
-    float Tprint;
-
-    ntot = nattempt[0] + nattempt[1];
-    fprintf(fplog, "\n");
-    fprintf(fplog, "Repl");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "    "); /* put the title closer to the center */
-    }
-    fprintf(fplog, "Empirical Transition Matrix\n");
-
-    fprintf(fplog, "Repl");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "%8d", (i + 1));
-    }
-    fprintf(fplog, "\n");
-
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "Repl");
-        for (j = 0; j < n; j++)
-        {
-            Tprint = 0.0;
-            if (nmoves[i][j] > 0)
-            {
-                Tprint = nmoves[i][j] / (2.0 * ntot);
-            }
-            fprintf(fplog, "%8.4f", Tprint);
-        }
-        fprintf(fplog, "%3d\n", i);
-    }
-}
-
-static void print_ind(FILE* fplog, const char* leg, int n, int* ind, const gmx_bool* bEx)
-{
-    int i;
-
-    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
-    for (i = 1; i < n; i++)
-    {
-        fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]);
-    }
-    fprintf(fplog, "\n");
-}
-
-static void print_allswitchind(FILE* fplog, int n, int* pind, int* allswaps, int* tmpswap)
-{
-    int i;
-
-    for (i = 0; i < n; i++)
-    {
-        tmpswap[i] = allswaps[i];
-    }
-    for (i = 0; i < n; i++)
-    {
-        allswaps[i] = tmpswap[pind[i]];
-    }
-
-    fprintf(fplog, "\nAccepted Exchanges:   ");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "%d ", pind[i]);
-    }
-    fprintf(fplog, "\n");
-
-    /* the "Order After Exchange" is the state label corresponding to the configuration that
-       started in state listed in order, i.e.
-
-       3 0 1 2
-
-       means that the:
-       configuration starting in simulation 3 is now in simulation 0,
-       configuration starting in simulation 0 is now in simulation 1,
-       configuration starting in simulation 1 is now in simulation 2,
-       configuration starting in simulation 2 is now in simulation 3
-     */
-    fprintf(fplog, "Order After Exchange: ");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "%d ", allswaps[i]);
-    }
-    fprintf(fplog, "\n\n");
-}
-
-static void print_prob(FILE* fplog, const char* leg, int n, real* prob)
-{
-    int  i;
-    char buf[8];
-
-    fprintf(fplog, "Repl %2s ", leg);
-    for (i = 1; i < n; i++)
-    {
-        if (prob[i] >= 0)
-        {
-            sprintf(buf, "%4.2f", prob[i]);
-            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf + 1);
-        }
-        else
-        {
-            fprintf(fplog, "     ");
-        }
-    }
-    fprintf(fplog, "\n");
-}
-
-static void print_count(FILE* fplog, const char* leg, int n, int* count)
-{
-    int i;
-
-    fprintf(fplog, "Repl %2s ", leg);
-    for (i = 1; i < n; i++)
-    {
-        fprintf(fplog, " %4d", count[i]);
-    }
-    fprintf(fplog, "\n");
-}
-
-static real calc_delta(FILE* fplog, gmx_bool bPrint, struct gmx_repl_ex* re, int a, int b, int ap, int bp)
-{
-
-    real   ediff, dpV, delta = 0;
-    real*  Epot = re->Epot;
-    real*  Vol  = re->Vol;
-    real** de   = re->de;
-    real*  beta = re->beta;
-
-    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
-       to the non permuted case */
-
-    switch (re->type)
-    {
-        case ereTEMP:
-            /*
-             * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439
-             */
-            ediff = Epot[b] - Epot[a];
-            delta = -(beta[bp] - beta[ap]) * ediff;
-            break;
-        case ereLAMBDA:
-            /* two cases:  when we are permuted, and not.  */
-            /* non-permuted:
-               ediff =  E_new - E_old
-                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
-                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
-                     =  de[b][a] + de[a][b] */
-
-            /* permuted:
-               ediff =  E_new - E_old
-                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
-                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
-                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
-                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
-                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
-            /* but, in the current code implementation, we flip configurations, not indices . . .
-               So let's examine that.
-                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
-                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_pb)]
-                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp]
-                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
-                     So the simple solution is to flip the
-                     position of perturbed and original indices in the tests.
-             */
-
-            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
-            delta = ediff * beta[a]; /* assume all same temperature in this case */
-            break;
-        case ereTL:
-            /* not permuted:  */
-            /* delta =  reduced E_new - reduced E_old
-                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
-                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
-                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
-                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
-                     =  [beta_b dH_b(x_a) + [beta_a dH_a(x_b) +
-                        beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b))
-                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */
-            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */
-            /* permuted (big breath!) */
-            /*   delta =  reduced E_new - reduced E_old
-                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
-                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
-                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
-                        - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a)
-                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
-                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
-                        [(beta_ap H_ap(x_b)  - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
-             + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
-                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
-                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
-             + beta_pb (H_a(x_a) - H_b(x_b))  - beta_ap (H_a(x_a) - H_b(x_b))
-                     =  ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b]  - beta_bp de[bp][b])
-             + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b))  */
-            delta = beta[bp] * (de[bp][a] - de[bp][b]) + beta[ap] * (de[ap][b] - de[ap][a])
-                    - (beta[bp] - beta[ap]) * (Epot[b] - Epot[a]);
-            break;
-        default: gmx_incons("Unknown replica exchange quantity");
-    }
-    if (bPrint)
-    {
-        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
-    }
-/* PLUMED HREX */
-/* this is necessary because with plumed HREX the energy contribution is
-   already taken into account */
-    if(plumed_hrex) delta=0.0;
-/* END PLUMED HREX */
-    if (re->bNPT)
-    {
-        /* revist the calculation for 5.0.  Might be some improvements. */
-        dpV = (beta[ap] * re->pres[ap] - beta[bp] * re->pres[bp]) * (Vol[b] - Vol[a]) / PRESFAC;
-        if (bPrint)
-        {
-            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
-        }
-        delta += dpV;
-    }
-    return delta;
-}
-
-static void test_for_replica_exchange(FILE*                 fplog,
-                                      const gmx_multisim_t* ms,
-                                      struct gmx_repl_ex*   re,
-                                      const gmx_enerdata_t* enerd,
-                                      real                  vol,
-                                      int64_t               step,
-                                      real                  time)
-{
-    int                                m, i, j, a, b, ap, bp, i0, i1, tmp;
-    real                               delta = 0;
-    gmx_bool                           bPrint, bMultiEx;
-    gmx_bool*                          bEx      = re->bEx;
-    real*                              prob     = re->prob;
-    int*                               pind     = re->destinations; /* permuted index */
-    gmx_bool                           bEpot    = FALSE;
-    gmx_bool                           bDLambda = FALSE;
-    gmx_bool                           bVol     = FALSE;
-    gmx::ThreeFry2x64<64>              rng(re->seed, gmx::RandomDomain::ReplicaExchange);
-    gmx::UniformRealDistribution<real> uniformRealDist;
-    gmx::UniformIntDistribution<int>   uniformNreplDist(0, re->nrepl - 1);
-
-    bMultiEx = (re->nex > 1); /* multiple exchanges at each state */
-    fprintf(fplog, "Replica exchange at step %" PRId64 " time %.5f\n", step, time);
-
-    if (re->bNPT)
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->Vol[i] = 0;
-        }
-        bVol              = TRUE;
-        re->Vol[re->repl] = vol;
-    }
-    if ((re->type == ereTEMP || re->type == ereTL))
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->Epot[i] = 0;
-        }
-        bEpot              = TRUE;
-        re->Epot[re->repl] = enerd->term[F_EPOT];
-        /* temperatures of different states*/
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->beta[i] = 1.0 / (re->q[ereTEMP][i] * BOLTZ);
-        }
-    }
-    else
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->beta[i] = 1.0 / (re->temp * BOLTZ); /* we have a single temperature */
-        }
-    }
-    if (re->type == ereLAMBDA || re->type == ereTL)
-    {
-        bDLambda = TRUE;
-        /* lambda differences. */
-        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
-           minus the energy of the jth simulation in the jth Hamiltonian */
-        for (i = 0; i < re->nrepl; i++)
-        {
-            for (j = 0; j < re->nrepl; j++)
-            {
-                re->de[i][j] = 0;
-            }
-        }
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->de[i][re->repl] = enerd->foreignLambdaTerms.deltaH(re->q[ereLAMBDA][i]);
-        }
-    }
-
-    /* now actually do the communication */
-    if (bVol)
-    {
-        gmx_sum_sim(re->nrepl, re->Vol, ms);
-    }
-    if (bEpot)
-    {
-        gmx_sum_sim(re->nrepl, re->Epot, ms);
-    }
-    if (bDLambda)
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            gmx_sum_sim(re->nrepl, re->de[i], ms);
-        }
-    }
-
-    /* make a duplicate set of indices for shuffling */
-    for (i = 0; i < re->nrepl; i++)
-    {
-        pind[i] = re->ind[i];
-    }
-
-    rng.restart(step, 0);
-
-    /* PLUMED */
-    int plumed_test_exchange_pattern=0;
-    if(plumed_test_exchange_pattern && plumed_hrex) gmx_fatal(FARGS,"hrex not compatible with ad hoc exchange patterns");
-    /* END PLUMED */
-
-    if (bMultiEx)
-    {
-        /* multiple random switch exchange */
-        int nself = 0;
-
-
-        for (i = 0; i < re->nex + nself; i++)
-        {
-            // For now this is superfluous, but just in case we ever add more
-            // calls in different branches it is safer to always reset the distribution.
-            uniformNreplDist.reset();
-
-            /* randomly select a pair  */
-            /* in theory, could reduce this by identifying only which switches had a nonneglibible
-               probability of occurring (log p > -100) and only operate on those switches */
-            /* find out which state it is from, and what label that state currently has. Likely
-               more work that useful. */
-            i0 = uniformNreplDist(rng);
-            i1 = uniformNreplDist(rng);
-            if (i0 == i1)
-            {
-                nself++;
-                continue; /* self-exchange, back up and do it again */
-            }
-
-            a  = re->ind[i0]; /* what are the indices of these states? */
-            b  = re->ind[i1];
-            ap = pind[i0];
-            bp = pind[i1];
-
-            bPrint = FALSE; /* too noisy */
-            /* calculate the energy difference */
-            /* if the code changes to flip the STATES, rather than the configurations,
-               use the commented version of the code */
-            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
-            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
-
-            /* we actually only use the first space in the prob and bEx array,
-               since there are actually many switches between pairs. */
-
-            if (delta <= 0)
-            {
-                /* accepted */
-                prob[0] = 1;
-                bEx[0]  = TRUE;
-            }
-            else
-            {
-                if (delta > c_probabilityCutoff)
-                {
-                    prob[0] = 0;
-                }
-                else
-                {
-                    prob[0] = exp(-delta);
-                }
-                // roll a number to determine if accepted. For now it is superfluous to
-                // reset, but just in case we ever add more calls in different branches
-                // it is safer to always reset the distribution.
-                uniformRealDist.reset();
-                bEx[0] = uniformRealDist(rng) < prob[0];
-            }
-            re->prob_sum[0] += prob[0];
-
-            if (bEx[0])
-            {
-                /* swap the states */
-                tmp      = pind[i0];
-                pind[i0] = pind[i1];
-                pind[i1] = tmp;
-            }
-        }
-        re->nattempt[0]++; /* keep track of total permutation trials here */
-        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
-    }
-    else
-    {
-        /* standard nearest neighbor replica exchange */
-
-        m = (step / re->nst) % 2;
-        /* PLUMED */
-        if(plumedswitch){
-          int partner=re->repl;
-          plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern);
-          if(plumed_test_exchange_pattern>0){
-            int *list;
-            snew(list,re->nrepl);
-            plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl));
-            plumed_cmd(plumedmain,"getExchangesList",list);
-            for(i=0; i<re->nrepl; i++) re->ind[i]=list[i];
-            sfree(list);
-          }
-
-          for(i=1; i<re->nrepl; i++) {
-            if (i % 2 != m) continue;
-            a = re->ind[i-1];
-            b = re->ind[i];
-            if(re->repl==a) partner=b;
-            if(re->repl==b) partner=a;
-          }
-          plumed_cmd(plumedmain,"GREX setPartner",&partner);
-          plumed_cmd(plumedmain,"GREX calculate",nullptr);
-          plumed_cmd(plumedmain,"GREX shareAllDeltaBias",nullptr);
-        }
-        /* END PLUMED */
-        for (i = 1; i < re->nrepl; i++)
-        {
-            a = re->ind[i - 1];
-            b = re->ind[i];
-
-            bPrint = (re->repl == a || re->repl == b);
-            if (i % 2 == m)
-            {
-                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
-                /* PLUMED */
-                if(plumedswitch){
-                  real adb,bdb,dplumed;
-                  char buf[300];
-                  sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb);
-                  sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb);
-                  dplumed=adb*re->beta[a]+bdb*re->beta[b];
-                  delta+=dplumed;
-                  if (bPrint)
-                    fprintf(fplog,"dplumed = %10.3e  dE_Term = %10.3e (kT)\n",dplumed,delta);
-                }
-                /* END PLUMED */
-                if (delta <= 0)
-                {
-                    /* accepted */
-                    prob[i] = 1;
-                    bEx[i]  = TRUE;
-                }
-                else
-                {
-                    if (delta > c_probabilityCutoff)
-                    {
-                        prob[i] = 0;
-                    }
-                    else
-                    {
-                        prob[i] = exp(-delta);
-                    }
-                    // roll a number to determine if accepted. For now it is superfluous to
-                    // reset, but just in case we ever add more calls in different branches
-                    // it is safer to always reset the distribution.
-                    uniformRealDist.reset();
-                    bEx[i] = uniformRealDist(rng) < prob[i];
-                }
-                re->prob_sum[i] += prob[i];
-
-                if (bEx[i])
-                {
-                  /* PLUMED */
-                  if(!plumed_test_exchange_pattern) {
-                    /* standard neighbour swapping */
-                    /* swap these two */
-                    tmp         = pind[i - 1];
-                    pind[i - 1] = pind[i];
-                    pind[i]     = tmp;
-                    re->nexchange[i]++; /* statistics for back compatibility */
-                  } else {
-                    /* alternative swapping patterns */
-                    tmp       = pind[a];
-                    pind[a]   = pind[b];
-                    pind[b]   = tmp;
-                    re->nexchange[i]++;  /* statistics for back compatibility */
-                  }
-                  /* END PLUMED */
-                }
-            }
-            else
-            {
-                prob[i] = -1;
-                bEx[i]  = FALSE;
-            }
-        }
-        /* print some statistics */
-        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
-        print_prob(fplog, "pr", re->nrepl, prob);
-        fprintf(fplog, "\n");
-        re->nattempt[m]++;
-    }
-
-    /* PLUMED */
-    if(plumed_test_exchange_pattern>0) {
-      for (i = 0; i < re->nrepl; i++)
-      {
-          re->ind[i] = i;
-      }
-    }
-    /* END PLUMED */
-
-    /* record which moves were made and accepted */
-    for (i = 0; i < re->nrepl; i++)
-    {
-        re->nmoves[re->ind[i]][pind[i]] += 1;
-        re->nmoves[pind[i]][re->ind[i]] += 1;
-    }
-    fflush(fplog); /* make sure we can see what the last exchange was */
-}
-
-static void cyclic_decomposition(const int* destinations, int** cyclic, gmx_bool* incycle, const int nrepl, int* nswap)
-{
-
-    int i, j, c, p;
-    int maxlen = 1;
-    for (i = 0; i < nrepl; i++)
-    {
-        incycle[i] = FALSE;
-    }
-    for (i = 0; i < nrepl; i++) /* one cycle for each replica */
-    {
-        if (incycle[i])
-        {
-            cyclic[i][0] = -1;
-            continue;
-        }
-        cyclic[i][0] = i;
-        incycle[i]   = TRUE;
-        c            = 1;
-        p            = i;
-        for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */
-        {
-            p = destinations[p]; /* start permuting */
-            if (p == i)
-            {
-                cyclic[i][c] = -1;
-                if (c > maxlen)
-                {
-                    maxlen = c;
-                }
-                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
-            }
-            else
-            {
-                cyclic[i][c] = p; /* each permutation gives a new member of the cycle */
-                incycle[p]   = TRUE;
-                c++;
-            }
-        }
-    }
-    *nswap = maxlen - 1;
-
-    if (debug)
-    {
-        for (i = 0; i < nrepl; i++)
-        {
-            fprintf(debug, "Cycle %d:", i);
-            for (j = 0; j < nrepl; j++)
-            {
-                if (cyclic[i][j] < 0)
-                {
-                    break;
-                }
-                fprintf(debug, "%2d", cyclic[i][j]);
-            }
-            fprintf(debug, "\n");
-        }
-        fflush(debug);
-    }
-}
-
-static void compute_exchange_order(int** cyclic, int** order, const int nrepl, const int maxswap)
-{
-    int i, j;
-
-    for (j = 0; j < maxswap; j++)
-    {
-        for (i = 0; i < nrepl; i++)
-        {
-            if (cyclic[i][j + 1] >= 0)
-            {
-                order[cyclic[i][j + 1]][j] = cyclic[i][j];
-                order[cyclic[i][j]][j]     = cyclic[i][j + 1];
-            }
-        }
-        for (i = 0; i < nrepl; i++)
-        {
-            if (order[i][j] < 0)
-            {
-                order[i][j] = i; /* if it's not exchanging, it should stay this round*/
-            }
-        }
-    }
-
-    if (debug)
-    {
-        fprintf(debug, "Replica Exchange Order\n");
-        for (i = 0; i < nrepl; i++)
-        {
-            fprintf(debug, "Replica %d:", i);
-            for (j = 0; j < maxswap; j++)
-            {
-                if (order[i][j] < 0)
-                {
-                    break;
-                }
-                fprintf(debug, "%2d", order[i][j]);
-            }
-            fprintf(debug, "\n");
-        }
-        fflush(debug);
-    }
-}
-
-static void prepare_to_do_exchange(struct gmx_repl_ex* re, const int replica_id, int* maxswap, gmx_bool* bThisReplicaExchanged)
-{
-    int i, j;
-    /* Hold the cyclic decomposition of the (multiple) replica
-     * exchange. */
-    gmx_bool bAnyReplicaExchanged = FALSE;
-    *bThisReplicaExchanged        = FALSE;
-
-    for (i = 0; i < re->nrepl; i++)
-    {
-        if (re->destinations[i] != re->ind[i])
-        {
-            /* only mark as exchanged if the index has been shuffled */
-            bAnyReplicaExchanged = TRUE;
-            break;
-        }
-    }
-    if (bAnyReplicaExchanged)
-    {
-        /* reinitialize the placeholder arrays */
-        for (i = 0; i < re->nrepl; i++)
-        {
-            for (j = 0; j < re->nrepl; j++)
-            {
-                re->cyclic[i][j] = -1;
-                re->order[i][j]  = -1;
-            }
-        }
-
-        /* Identify the cyclic decomposition of the permutation (very
-         * fast if neighbor replica exchange). */
-        cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
-
-        /* Now translate the decomposition into a replica exchange
-         * order at each step. */
-        compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap);
-
-        /* Did this replica do any exchange at any point? */
-        for (j = 0; j < *maxswap; j++)
-        {
-            if (replica_id != re->order[replica_id][j])
-            {
-                *bThisReplicaExchanged = TRUE;
-                break;
-            }
-        }
-    }
-}
-
-gmx_bool replica_exchange(FILE*                 fplog,
-                          const t_commrec*      cr,
-                          const gmx_multisim_t* ms,
-                          struct gmx_repl_ex*   re,
-                          t_state*              state,
-                          const gmx_enerdata_t* enerd,
-                          t_state*              state_local,
-                          int64_t               step,
-                          real                  time)
-{
-    int j;
-    int replica_id = 0;
-    int exchange_partner;
-    int maxswap = 0;
-    /* Number of rounds of exchanges needed to deal with any multiple
-     * exchanges. */
-    /* Where each replica ends up after the exchange attempt(s). */
-    /* The order in which multiple exchanges will occur. */
-    gmx_bool bThisReplicaExchanged = FALSE;
-
-    /* PLUMED */
-    if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",nullptr);
-    /* END PLUMED */
-
-    if (MASTER(cr))
-    {
-        replica_id = re->repl;
-        test_for_replica_exchange(fplog, ms, re, enerd, det(state_local->box), step, time);
-        prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged);
-    }
-    /* Do intra-simulation broadcast so all processors belonging to
-     * each simulation know whether they need to participate in
-     * collecting the state. Otherwise, they might as well get on with
-     * the next thing to do. */
-    if (DOMAINDECOMP(cr))
-    {
-#if GMX_MPI
-        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), cr->mpi_comm_mygroup);
-#endif
-    }
-
-    if (bThisReplicaExchanged)
-    {
-        /* Exchange the states */
-        /* Collect the global state on the master node */
-        if (DOMAINDECOMP(cr))
-        {
-            dd_collect_state(cr->dd, state_local, state);
-        }
-        else
-        {
-            copy_state_serial(state_local, state);
-        }
-
-        if (MASTER(cr))
-        {
-            /* There will be only one swap cycle with standard replica
-             * exchange, but there may be multiple swap cycles if we
-             * allow multiple swaps. */
-
-            for (j = 0; j < maxswap; j++)
-            {
-                exchange_partner = re->order[replica_id][j];
-
-                if (exchange_partner != replica_id)
-                {
-                    /* Exchange the global states between the master nodes */
-                    if (debug)
-                    {
-                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
-                    }
-                    exchange_state(ms, exchange_partner, state);
-                }
-            }
-            /* For temperature-type replica exchange, we need to scale
-             * the velocities. */
-            if (re->type == ereTEMP || re->type == ereTL)
-            {
-                scale_velocities(state->v, std::sqrt(re->q[ereTEMP][replica_id]
-                                                     / re->q[ereTEMP][re->destinations[replica_id]]));
-            }
-        }
-
-        /* With domain decomposition the global state is distributed later */
-        if (!DOMAINDECOMP(cr))
-        {
-            /* Copy the global state to the local state data structure */
-            copy_state_serial(state, state_local);
-        }
-    }
-
-    return bThisReplicaExchanged;
-}
-
-void print_replica_exchange_statistics(FILE* fplog, struct gmx_repl_ex* re)
-{
-    int i;
-
-    fprintf(fplog, "\nReplica exchange statistics\n");
-
-    if (re->nex == 0)
-    {
-        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n", re->nattempt[0] + re->nattempt[1],
-                re->nattempt[1], re->nattempt[0]);
-
-        fprintf(fplog, "Repl  average probabilities:\n");
-        for (i = 1; i < re->nrepl; i++)
-        {
-            if (re->nattempt[i % 2] == 0)
-            {
-                re->prob[i] = 0;
-            }
-            else
-            {
-                re->prob[i] = re->prob_sum[i] / re->nattempt[i % 2];
-            }
-        }
-        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
-        print_prob(fplog, "", re->nrepl, re->prob);
-
-        fprintf(fplog, "Repl  number of exchanges:\n");
-        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
-        print_count(fplog, "", re->nrepl, re->nexchange);
-
-        fprintf(fplog, "Repl  average number of exchanges:\n");
-        for (i = 1; i < re->nrepl; i++)
-        {
-            if (re->nattempt[i % 2] == 0)
-            {
-                re->prob[i] = 0;
-            }
-            else
-            {
-                re->prob[i] = (static_cast<real>(re->nexchange[i])) / re->nattempt[i % 2];
-            }
-        }
-        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
-        print_prob(fplog, "", re->nrepl, re->prob);
-
-        fprintf(fplog, "\n");
-    }
-    /* print the transition matrix */
-    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
-}
-
-/* PLUMED HREX */
-int replica_exchange_get_repl(const gmx_repl_ex_t re){
-  return re->repl;
-};
-
-int replica_exchange_get_nrepl(const gmx_repl_ex_t re){
-  return re->nrepl;
-};
-/* END PLUMED HREX */
-//! \endcond
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed
deleted file mode 100644
index c40161d9ef..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed
+++ /dev/null
@@ -1,1389 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/*! \internal \file
- *
- * \brief Implements the replica exchange routines.
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "replicaexchange.h"
-
-#include "config.h"
-
-#include <cmath>
-
-#include <random>
-
-#include "gromacs/domdec/collect.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/math/units.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/random/threefry.h"
-#include "gromacs/random/uniformintdistribution.h"
-#include "gromacs/random/uniformrealdistribution.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/pleasecite.h"
-#include "gromacs/utility/smalloc.h"
-
-//! Helps cut off probability values.
-constexpr int c_probabilityCutoff = 100;
-
-/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
-
-//! Rank in the multisimulation
-#define MSRANK(ms, nodeid) (nodeid)
-
-//! Enum for replica exchange flavours
-enum
-{
-    ereTEMP,
-    ereLAMBDA,
-    ereENDSINGLE,
-    ereTL,
-    ereNR
-};
-/*! \brief Strings describing replica exchange flavours.
- *
- *  end_single_marker merely notes the end of single variable replica
- *  exchange. All types higher than it are multiple replica exchange
- *  methods.
- *
- * Eventually, should add 'pressure', 'temperature and pressure',
- *  'lambda_and_pressure', 'temperature_lambda_pressure'?; Let's wait
- *  until we feel better about the pressure control methods giving
- *  exact ensembles.  Right now, we assume constant pressure */
-static const char* erename[ereNR] = { "temperature", "lambda", "end_single_marker",
-                                      "temperature and lambda" };
-
-//! Working data for replica exchange.
-struct gmx_repl_ex
-{
-    //! Replica ID
-    int repl;
-    //! Total number of replica
-    int nrepl;
-    //! Temperature
-    real temp;
-    //! Replica exchange type from ere enum
-    int type;
-    //! Quantity, e.g. temperature or lambda; first index is ere, second index is replica ID
-    real** q;
-    //! Use constant pressure and temperature
-    gmx_bool bNPT;
-    //! Replica pressures
-    real* pres;
-    //! Replica indices
-    int* ind;
-    //! Used for keeping track of all the replica swaps
-    int* allswaps;
-    //! Replica exchange interval (number of steps)
-    int nst;
-    //! Number of exchanges per interval
-    int nex;
-    //! Random seed
-    int seed;
-    //! Number of even and odd replica change attempts
-    int nattempt[2];
-    //! Sum of probabilities
-    real* prob_sum;
-    //! Number of moves between replicas i and j
-    int** nmoves;
-    //! i-th element of the array is the number of exchanges between replica i-1 and i
-    int* nexchange;
-
-    /*! \brief Helper arrays for replica exchange; allocated here
-     * so they don't have to be allocated each time */
-    //! \{
-    int*      destinations;
-    int**     cyclic;
-    int**     order;
-    int*      tmpswap;
-    gmx_bool* incycle;
-    gmx_bool* bEx;
-    //! \}
-
-    //! Helper arrays to hold the quantities that are exchanged.
-    //! \{
-    real*  prob;
-    real*  Epot;
-    real*  beta;
-    real*  Vol;
-    real** de;
-    //! \}
-};
-
-// TODO We should add Doxygen here some time.
-//! \cond
-
-static gmx_bool repl_quantity(const gmx_multisim_t* ms, struct gmx_repl_ex* re, int ere, real q)
-{
-    real*    qall;
-    gmx_bool bDiff;
-    int      s;
-
-    snew(qall, ms->numSimulations_);
-    qall[re->repl] = q;
-    gmx_sum_sim(ms->numSimulations_, qall, ms);
-
-    bDiff = FALSE;
-    for (s = 1; s < ms->numSimulations_; s++)
-    {
-        if (qall[s] != qall[0])
-        {
-            bDiff = TRUE;
-        }
-    }
-
-    if (bDiff)
-    {
-        /* Set the replica exchange type and quantities */
-        re->type = ere;
-
-        snew(re->q[ere], re->nrepl);
-        for (s = 0; s < ms->numSimulations_; s++)
-        {
-            re->q[ere][s] = qall[s];
-        }
-    }
-    sfree(qall);
-    return bDiff;
-}
-
-gmx_repl_ex_t init_replica_exchange(FILE*                            fplog,
-                                    const gmx_multisim_t*            ms,
-                                    int                              numAtomsInSystem,
-                                    const t_inputrec*                ir,
-                                    const ReplicaExchangeParameters& replExParams)
-{
-    real                pres;
-    int                 i, j;
-    struct gmx_repl_ex* re;
-    gmx_bool            bTemp;
-    gmx_bool            bLambda = FALSE;
-
-    fprintf(fplog, "\nInitializing Replica Exchange\n");
-
-    if (!isMultiSim(ms) || ms->numSimulations_ == 1)
-    {
-        gmx_fatal(FARGS,
-                  "Nothing to exchange with only one replica, maybe you forgot to set the "
-                  "-multidir option of mdrun?");
-    }
-    if (replExParams.numExchanges < 0)
-    {
-        gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive");
-    }
-
-    if (!EI_DYNAMICS(ir->eI))
-    {
-        gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations");
-        /* Note that PAR(cr) is defined by cr->nnodes > 1, which is
-         * distinct from isMultiSim(ms). A multi-simulation only runs
-         * with real MPI parallelism, but this does not imply PAR(cr)
-         * is true!
-         *
-         * Since we are using a dynamical integrator, the only
-         * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are
-         * synonymous. The only way for cr->nnodes > 1 to be true is
-         * if we are using DD. */
-    }
-
-    snew(re, 1);
-
-    re->repl  = ms->simulationIndex_;
-    re->nrepl = ms->numSimulations_;
-    snew(re->q, ereENDSINGLE);
-
-    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
-
-    /* We only check that the number of atoms in the systms match.
-     * This, of course, do not guarantee that the systems are the same,
-     * but it does guarantee that we can perform replica exchange.
-     */
-    check_multi_int(fplog, ms, numAtomsInSystem, "the number of atoms", FALSE);
-    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
-    check_multi_int64(fplog, ms, ir->init_step + ir->nsteps, "init_step+nsteps", FALSE);
-    const int nst = replExParams.exchangeInterval;
-    check_multi_int64(fplog, ms, (ir->init_step + nst - 1) / nst,
-                      "first exchange step: init_step/-replex", FALSE);
-    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
-    check_multi_int(fplog, ms, ir->opts.ngtc, "the number of temperature coupling groups", FALSE);
-    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
-    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
-    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
-
-    re->temp = ir->opts.ref_t[0];
-    for (i = 1; (i < ir->opts.ngtc); i++)
-    {
-        if (ir->opts.ref_t[i] != re->temp)
-        {
-            fprintf(fplog,
-                    "\nWARNING: The temperatures of the different temperature coupling groups are "
-                    "not identical\n\n");
-            fprintf(stderr,
-                    "\nWARNING: The temperatures of the different temperature coupling groups are "
-                    "not identical\n\n");
-        }
-    }
-
-    re->type = -1;
-    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
-    if (ir->efep != efepNO)
-    {
-        bLambda = repl_quantity(ms, re, ereLAMBDA, static_cast<real>(ir->fepvals->init_fep_state));
-    }
-    if (re->type == -1) /* nothing was assigned */
-    {
-        gmx_fatal(FARGS,
-                  "The properties of the %d systems are all the same, there is nothing to exchange",
-                  re->nrepl);
-    }
-    if (bLambda && bTemp)
-    {
-        re->type = ereTL;
-    }
-
-    if (bTemp)
-    {
-        please_cite(fplog, "Sugita1999a");
-        if (ir->epc != epcNO)
-        {
-            re->bNPT = TRUE;
-            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
-            please_cite(fplog, "Okabe2001a");
-        }
-        if (ir->etc == etcBERENDSEN)
-        {
-            gmx_fatal(FARGS,
-                      "REMD with the %s thermostat does not produce correct potential energy "
-                      "distributions, consider using the %s thermostat instead",
-                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
-        }
-    }
-    if (bLambda)
-    {
-        if (ir->fepvals->delta_lambda != 0) /* check this? */
-        {
-            gmx_fatal(FARGS, "delta_lambda is not zero");
-        }
-    }
-    if (re->bNPT)
-    {
-        snew(re->pres, re->nrepl);
-        if (ir->epct == epctSURFACETENSION)
-        {
-            pres = ir->ref_p[ZZ][ZZ];
-        }
-        else
-        {
-            pres = 0;
-            j    = 0;
-            for (i = 0; i < DIM; i++)
-            {
-                if (ir->compress[i][i] != 0)
-                {
-                    pres += ir->ref_p[i][i];
-                    j++;
-                }
-            }
-            pres /= j;
-        }
-        re->pres[re->repl] = pres;
-        gmx_sum_sim(re->nrepl, re->pres, ms);
-    }
-
-    /* Make an index for increasing replica order */
-    /* only makes sense if one or the other is varying, not both!
-       if both are varying, we trust the order the person gave. */
-    snew(re->ind, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        re->ind[i] = i;
-    }
-
-    if (re->type < ereENDSINGLE)
-    {
-
-        for (i = 0; i < re->nrepl; i++)
-        {
-            for (j = i + 1; j < re->nrepl; j++)
-            {
-                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
-                {
-                    /* Unordered replicas are supposed to work, but there
-                     * is still an issues somewhere.
-                     * Note that at this point still re->ind[i]=i.
-                     */
-                    gmx_fatal(FARGS,
-                              "Replicas with indices %d < %d have %ss %g > %g, please order your "
-                              "replicas on increasing %s",
-                              i, j, erename[re->type], re->q[re->type][i], re->q[re->type][j],
-                              erename[re->type]);
-                }
-                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
-                {
-                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
-                }
-            }
-        }
-    }
-
-    /* keep track of all the swaps, starting with the initial placement. */
-    snew(re->allswaps, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        re->allswaps[i] = re->ind[i];
-    }
-
-    switch (re->type)
-    {
-        case ereTEMP:
-            fprintf(fplog, "\nReplica exchange in temperature\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
-            }
-            fprintf(fplog, "\n");
-            break;
-        case ereLAMBDA:
-            fprintf(fplog, "\nReplica exchange in lambda\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %3d", static_cast<int>(re->q[re->type][re->ind[i]]));
-            }
-            fprintf(fplog, "\n");
-            break;
-        case ereTL:
-            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
-            }
-            fprintf(fplog, "\n");
-            for (i = 0; i < re->nrepl; i++)
-            {
-                fprintf(fplog, " %5d", static_cast<int>(re->q[ereLAMBDA][re->ind[i]]));
-            }
-            fprintf(fplog, "\n");
-            break;
-        default: gmx_incons("Unknown replica exchange quantity");
-    }
-    if (re->bNPT)
-    {
-        fprintf(fplog, "\nRepl  p");
-        for (i = 0; i < re->nrepl; i++)
-        {
-            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
-        }
-
-        for (i = 0; i < re->nrepl; i++)
-        {
-            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i - 1]]))
-            {
-                fprintf(fplog,
-                        "\nWARNING: The reference pressures decrease with increasing "
-                        "temperatures\n\n");
-                fprintf(stderr,
-                        "\nWARNING: The reference pressures decrease with increasing "
-                        "temperatures\n\n");
-            }
-        }
-    }
-    re->nst = nst;
-    if (replExParams.randomSeed == -1)
-    {
-        if (isMasterSim(ms))
-        {
-            re->seed = static_cast<int>(gmx::makeRandomSeed());
-        }
-        else
-        {
-            re->seed = 0;
-        }
-        gmx_sumi_sim(1, &(re->seed), ms);
-    }
-    else
-    {
-        re->seed = replExParams.randomSeed;
-    }
-    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
-    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
-
-    re->nattempt[0] = 0;
-    re->nattempt[1] = 0;
-
-    snew(re->prob_sum, re->nrepl);
-    snew(re->nexchange, re->nrepl);
-    snew(re->nmoves, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        snew(re->nmoves[i], re->nrepl);
-    }
-    fprintf(fplog, "Replica exchange information below: ex and x = exchange, pr = probability\n");
-
-    /* generate space for the helper functions so we don't have to snew each time */
-
-    snew(re->destinations, re->nrepl);
-    snew(re->incycle, re->nrepl);
-    snew(re->tmpswap, re->nrepl);
-    snew(re->cyclic, re->nrepl);
-    snew(re->order, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        snew(re->cyclic[i], re->nrepl + 1);
-        snew(re->order[i], re->nrepl);
-    }
-    /* allocate space for the functions storing the data for the replicas */
-    /* not all of these arrays needed in all cases, but they don't take
-       up much space, since the max size is nrepl**2 */
-    snew(re->prob, re->nrepl);
-    snew(re->bEx, re->nrepl);
-    snew(re->beta, re->nrepl);
-    snew(re->Vol, re->nrepl);
-    snew(re->Epot, re->nrepl);
-    snew(re->de, re->nrepl);
-    for (i = 0; i < re->nrepl; i++)
-    {
-        snew(re->de[i], re->nrepl);
-    }
-    re->nex = replExParams.numExchanges;
-    return re;
-}
-
-static void exchange_reals(const gmx_multisim_t gmx_unused* ms, int gmx_unused b, real* v, int n)
-{
-    real* buf;
-    int   i;
-
-    if (v)
-    {
-        snew(buf, n);
-#if GMX_MPI
-        /*
-           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
-           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
-           ms->mastersComm_,MPI_STATUS_IGNORE);
-         */
-        {
-            MPI_Request mpi_req;
-
-            MPI_Isend(v, n * sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, &mpi_req);
-            MPI_Recv(buf, n * sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, MPI_STATUS_IGNORE);
-            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
-        }
-#endif
-        for (i = 0; i < n; i++)
-        {
-            v[i] = buf[i];
-        }
-        sfree(buf);
-    }
-}
-
-
-static void exchange_doubles(const gmx_multisim_t gmx_unused* ms, int gmx_unused b, double* v, int n)
-{
-    double* buf;
-    int     i;
-
-    if (v)
-    {
-        snew(buf, n);
-#if GMX_MPI
-        /*
-           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
-           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
-           ms->mastersComm_,MPI_STATUS_IGNORE);
-         */
-        {
-            MPI_Request mpi_req;
-
-            MPI_Isend(v, n * sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, &mpi_req);
-            MPI_Recv(buf, n * sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_,
-                     MPI_STATUS_IGNORE);
-            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
-        }
-#endif
-        for (i = 0; i < n; i++)
-        {
-            v[i] = buf[i];
-        }
-        sfree(buf);
-    }
-}
-
-static void exchange_rvecs(const gmx_multisim_t gmx_unused* ms, int gmx_unused b, rvec* v, int n)
-{
-    rvec* buf;
-    int   i;
-
-    if (v)
-    {
-        snew(buf, n);
-#if GMX_MPI
-        /*
-           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
-           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
-           ms->mastersComm_,MPI_STATUS_IGNORE);
-         */
-        {
-            MPI_Request mpi_req;
-
-            MPI_Isend(v[0], n * sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_, &mpi_req);
-            MPI_Recv(buf[0], n * sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, ms->mastersComm_,
-                     MPI_STATUS_IGNORE);
-            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
-        }
-#endif
-        for (i = 0; i < n; i++)
-        {
-            copy_rvec(buf[i], v[i]);
-        }
-        sfree(buf);
-    }
-}
-
-static void exchange_state(const gmx_multisim_t* ms, int b, t_state* state)
-{
-    /* When t_state changes, this code should be updated. */
-    int ngtc, nnhpres;
-    ngtc    = state->ngtc * state->nhchainlength;
-    nnhpres = state->nnhpres * state->nhchainlength;
-    exchange_rvecs(ms, b, state->box, DIM);
-    exchange_rvecs(ms, b, state->box_rel, DIM);
-    exchange_rvecs(ms, b, state->boxv, DIM);
-    exchange_reals(ms, b, &(state->veta), 1);
-    exchange_reals(ms, b, &(state->vol0), 1);
-    exchange_rvecs(ms, b, state->svir_prev, DIM);
-    exchange_rvecs(ms, b, state->fvir_prev, DIM);
-    exchange_rvecs(ms, b, state->pres_prev, DIM);
-    exchange_doubles(ms, b, state->nosehoover_xi.data(), ngtc);
-    exchange_doubles(ms, b, state->nosehoover_vxi.data(), ngtc);
-    exchange_doubles(ms, b, state->nhpres_xi.data(), nnhpres);
-    exchange_doubles(ms, b, state->nhpres_vxi.data(), nnhpres);
-    exchange_doubles(ms, b, state->therm_integral.data(), state->ngtc);
-    exchange_doubles(ms, b, &state->baros_integral, 1);
-    exchange_rvecs(ms, b, state->x.rvec_array(), state->natoms);
-    exchange_rvecs(ms, b, state->v.rvec_array(), state->natoms);
-}
-
-static void copy_state_serial(const t_state* src, t_state* dest)
-{
-    if (dest != src)
-    {
-        /* Currently the local state is always a pointer to the global
-         * in serial, so we should never end up here.
-         * TODO: Implement a (trivial) t_state copy once converted to C++.
-         */
-        GMX_RELEASE_ASSERT(false, "State copying is currently not implemented in replica exchange");
-    }
-}
-
-static void scale_velocities(gmx::ArrayRef<gmx::RVec> velocities, real fac)
-{
-    for (auto& v : velocities)
-    {
-        v *= fac;
-    }
-}
-
-static void print_transition_matrix(FILE* fplog, int n, int** nmoves, const int* nattempt)
-{
-    int   i, j, ntot;
-    float Tprint;
-
-    ntot = nattempt[0] + nattempt[1];
-    fprintf(fplog, "\n");
-    fprintf(fplog, "Repl");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "    "); /* put the title closer to the center */
-    }
-    fprintf(fplog, "Empirical Transition Matrix\n");
-
-    fprintf(fplog, "Repl");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "%8d", (i + 1));
-    }
-    fprintf(fplog, "\n");
-
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "Repl");
-        for (j = 0; j < n; j++)
-        {
-            Tprint = 0.0;
-            if (nmoves[i][j] > 0)
-            {
-                Tprint = nmoves[i][j] / (2.0 * ntot);
-            }
-            fprintf(fplog, "%8.4f", Tprint);
-        }
-        fprintf(fplog, "%3d\n", i);
-    }
-}
-
-static void print_ind(FILE* fplog, const char* leg, int n, int* ind, const gmx_bool* bEx)
-{
-    int i;
-
-    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
-    for (i = 1; i < n; i++)
-    {
-        fprintf(fplog, " %c %2d", (bEx != nullptr && bEx[i]) ? 'x' : ' ', ind[i]);
-    }
-    fprintf(fplog, "\n");
-}
-
-static void print_allswitchind(FILE* fplog, int n, int* pind, int* allswaps, int* tmpswap)
-{
-    int i;
-
-    for (i = 0; i < n; i++)
-    {
-        tmpswap[i] = allswaps[i];
-    }
-    for (i = 0; i < n; i++)
-    {
-        allswaps[i] = tmpswap[pind[i]];
-    }
-
-    fprintf(fplog, "\nAccepted Exchanges:   ");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "%d ", pind[i]);
-    }
-    fprintf(fplog, "\n");
-
-    /* the "Order After Exchange" is the state label corresponding to the configuration that
-       started in state listed in order, i.e.
-
-       3 0 1 2
-
-       means that the:
-       configuration starting in simulation 3 is now in simulation 0,
-       configuration starting in simulation 0 is now in simulation 1,
-       configuration starting in simulation 1 is now in simulation 2,
-       configuration starting in simulation 2 is now in simulation 3
-     */
-    fprintf(fplog, "Order After Exchange: ");
-    for (i = 0; i < n; i++)
-    {
-        fprintf(fplog, "%d ", allswaps[i]);
-    }
-    fprintf(fplog, "\n\n");
-}
-
-static void print_prob(FILE* fplog, const char* leg, int n, real* prob)
-{
-    int  i;
-    char buf[8];
-
-    fprintf(fplog, "Repl %2s ", leg);
-    for (i = 1; i < n; i++)
-    {
-        if (prob[i] >= 0)
-        {
-            sprintf(buf, "%4.2f", prob[i]);
-            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf + 1);
-        }
-        else
-        {
-            fprintf(fplog, "     ");
-        }
-    }
-    fprintf(fplog, "\n");
-}
-
-static void print_count(FILE* fplog, const char* leg, int n, int* count)
-{
-    int i;
-
-    fprintf(fplog, "Repl %2s ", leg);
-    for (i = 1; i < n; i++)
-    {
-        fprintf(fplog, " %4d", count[i]);
-    }
-    fprintf(fplog, "\n");
-}
-
-static real calc_delta(FILE* fplog, gmx_bool bPrint, struct gmx_repl_ex* re, int a, int b, int ap, int bp)
-{
-
-    real   ediff, dpV, delta = 0;
-    real*  Epot = re->Epot;
-    real*  Vol  = re->Vol;
-    real** de   = re->de;
-    real*  beta = re->beta;
-
-    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
-       to the non permuted case */
-
-    switch (re->type)
-    {
-        case ereTEMP:
-            /*
-             * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439
-             */
-            ediff = Epot[b] - Epot[a];
-            delta = -(beta[bp] - beta[ap]) * ediff;
-            break;
-        case ereLAMBDA:
-            /* two cases:  when we are permuted, and not.  */
-            /* non-permuted:
-               ediff =  E_new - E_old
-                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
-                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
-                     =  de[b][a] + de[a][b] */
-
-            /* permuted:
-               ediff =  E_new - E_old
-                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
-                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
-                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
-                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
-                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
-            /* but, in the current code implementation, we flip configurations, not indices . . .
-               So let's examine that.
-                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
-                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_pb)]
-                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp]
-                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
-                     So the simple solution is to flip the
-                     position of perturbed and original indices in the tests.
-             */
-
-            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
-            delta = ediff * beta[a]; /* assume all same temperature in this case */
-            break;
-        case ereTL:
-            /* not permuted:  */
-            /* delta =  reduced E_new - reduced E_old
-                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
-                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
-                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
-                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
-                     =  [beta_b dH_b(x_a) + [beta_a dH_a(x_b) +
-                        beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b))
-                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */
-            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */
-            /* permuted (big breath!) */
-            /*   delta =  reduced E_new - reduced E_old
-                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
-                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
-                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
-                        - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a)
-                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
-                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
-                        [(beta_ap H_ap(x_b)  - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
-             + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
-                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
-                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
-             + beta_pb (H_a(x_a) - H_b(x_b))  - beta_ap (H_a(x_a) - H_b(x_b))
-                     =  ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b]  - beta_bp de[bp][b])
-             + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b))  */
-            delta = beta[bp] * (de[bp][a] - de[bp][b]) + beta[ap] * (de[ap][b] - de[ap][a])
-                    - (beta[bp] - beta[ap]) * (Epot[b] - Epot[a]);
-            break;
-        default: gmx_incons("Unknown replica exchange quantity");
-    }
-    if (bPrint)
-    {
-        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
-    }
-    if (re->bNPT)
-    {
-        /* revist the calculation for 5.0.  Might be some improvements. */
-        dpV = (beta[ap] * re->pres[ap] - beta[bp] * re->pres[bp]) * (Vol[b] - Vol[a]) / PRESFAC;
-        if (bPrint)
-        {
-            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\n", dpV, delta + dpV);
-        }
-        delta += dpV;
-    }
-    return delta;
-}
-
-static void test_for_replica_exchange(FILE*                 fplog,
-                                      const gmx_multisim_t* ms,
-                                      struct gmx_repl_ex*   re,
-                                      const gmx_enerdata_t* enerd,
-                                      real                  vol,
-                                      int64_t               step,
-                                      real                  time)
-{
-    int                                m, i, j, a, b, ap, bp, i0, i1, tmp;
-    real                               delta = 0;
-    gmx_bool                           bPrint, bMultiEx;
-    gmx_bool*                          bEx      = re->bEx;
-    real*                              prob     = re->prob;
-    int*                               pind     = re->destinations; /* permuted index */
-    gmx_bool                           bEpot    = FALSE;
-    gmx_bool                           bDLambda = FALSE;
-    gmx_bool                           bVol     = FALSE;
-    gmx::ThreeFry2x64<64>              rng(re->seed, gmx::RandomDomain::ReplicaExchange);
-    gmx::UniformRealDistribution<real> uniformRealDist;
-    gmx::UniformIntDistribution<int>   uniformNreplDist(0, re->nrepl - 1);
-
-    bMultiEx = (re->nex > 1); /* multiple exchanges at each state */
-    fprintf(fplog, "Replica exchange at step %" PRId64 " time %.5f\n", step, time);
-
-    if (re->bNPT)
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->Vol[i] = 0;
-        }
-        bVol              = TRUE;
-        re->Vol[re->repl] = vol;
-    }
-    if ((re->type == ereTEMP || re->type == ereTL))
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->Epot[i] = 0;
-        }
-        bEpot              = TRUE;
-        re->Epot[re->repl] = enerd->term[F_EPOT];
-        /* temperatures of different states*/
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->beta[i] = 1.0 / (re->q[ereTEMP][i] * BOLTZ);
-        }
-    }
-    else
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->beta[i] = 1.0 / (re->temp * BOLTZ); /* we have a single temperature */
-        }
-    }
-    if (re->type == ereLAMBDA || re->type == ereTL)
-    {
-        bDLambda = TRUE;
-        /* lambda differences. */
-        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
-           minus the energy of the jth simulation in the jth Hamiltonian */
-        for (i = 0; i < re->nrepl; i++)
-        {
-            for (j = 0; j < re->nrepl; j++)
-            {
-                re->de[i][j] = 0;
-            }
-        }
-        for (i = 0; i < re->nrepl; i++)
-        {
-            re->de[i][re->repl] = enerd->foreignLambdaTerms.deltaH(re->q[ereLAMBDA][i]);
-        }
-    }
-
-    /* now actually do the communication */
-    if (bVol)
-    {
-        gmx_sum_sim(re->nrepl, re->Vol, ms);
-    }
-    if (bEpot)
-    {
-        gmx_sum_sim(re->nrepl, re->Epot, ms);
-    }
-    if (bDLambda)
-    {
-        for (i = 0; i < re->nrepl; i++)
-        {
-            gmx_sum_sim(re->nrepl, re->de[i], ms);
-        }
-    }
-
-    /* make a duplicate set of indices for shuffling */
-    for (i = 0; i < re->nrepl; i++)
-    {
-        pind[i] = re->ind[i];
-    }
-
-    rng.restart(step, 0);
-
-    if (bMultiEx)
-    {
-        /* multiple random switch exchange */
-        int nself = 0;
-
-
-        for (i = 0; i < re->nex + nself; i++)
-        {
-            // For now this is superfluous, but just in case we ever add more
-            // calls in different branches it is safer to always reset the distribution.
-            uniformNreplDist.reset();
-
-            /* randomly select a pair  */
-            /* in theory, could reduce this by identifying only which switches had a nonneglibible
-               probability of occurring (log p > -100) and only operate on those switches */
-            /* find out which state it is from, and what label that state currently has. Likely
-               more work that useful. */
-            i0 = uniformNreplDist(rng);
-            i1 = uniformNreplDist(rng);
-            if (i0 == i1)
-            {
-                nself++;
-                continue; /* self-exchange, back up and do it again */
-            }
-
-            a  = re->ind[i0]; /* what are the indices of these states? */
-            b  = re->ind[i1];
-            ap = pind[i0];
-            bp = pind[i1];
-
-            bPrint = FALSE; /* too noisy */
-            /* calculate the energy difference */
-            /* if the code changes to flip the STATES, rather than the configurations,
-               use the commented version of the code */
-            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
-            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
-
-            /* we actually only use the first space in the prob and bEx array,
-               since there are actually many switches between pairs. */
-
-            if (delta <= 0)
-            {
-                /* accepted */
-                prob[0] = 1;
-                bEx[0]  = TRUE;
-            }
-            else
-            {
-                if (delta > c_probabilityCutoff)
-                {
-                    prob[0] = 0;
-                }
-                else
-                {
-                    prob[0] = exp(-delta);
-                }
-                // roll a number to determine if accepted. For now it is superfluous to
-                // reset, but just in case we ever add more calls in different branches
-                // it is safer to always reset the distribution.
-                uniformRealDist.reset();
-                bEx[0] = uniformRealDist(rng) < prob[0];
-            }
-            re->prob_sum[0] += prob[0];
-
-            if (bEx[0])
-            {
-                /* swap the states */
-                tmp      = pind[i0];
-                pind[i0] = pind[i1];
-                pind[i1] = tmp;
-            }
-        }
-        re->nattempt[0]++; /* keep track of total permutation trials here */
-        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
-    }
-    else
-    {
-        /* standard nearest neighbor replica exchange */
-
-        m = (step / re->nst) % 2;
-        for (i = 1; i < re->nrepl; i++)
-        {
-            a = re->ind[i - 1];
-            b = re->ind[i];
-
-            bPrint = (re->repl == a || re->repl == b);
-            if (i % 2 == m)
-            {
-                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
-                if (delta <= 0)
-                {
-                    /* accepted */
-                    prob[i] = 1;
-                    bEx[i]  = TRUE;
-                }
-                else
-                {
-                    if (delta > c_probabilityCutoff)
-                    {
-                        prob[i] = 0;
-                    }
-                    else
-                    {
-                        prob[i] = exp(-delta);
-                    }
-                    // roll a number to determine if accepted. For now it is superfluous to
-                    // reset, but just in case we ever add more calls in different branches
-                    // it is safer to always reset the distribution.
-                    uniformRealDist.reset();
-                    bEx[i] = uniformRealDist(rng) < prob[i];
-                }
-                re->prob_sum[i] += prob[i];
-
-                if (bEx[i])
-                {
-                    /* swap these two */
-                    tmp         = pind[i - 1];
-                    pind[i - 1] = pind[i];
-                    pind[i]     = tmp;
-                    re->nexchange[i]++; /* statistics for back compatibility */
-                }
-            }
-            else
-            {
-                prob[i] = -1;
-                bEx[i]  = FALSE;
-            }
-        }
-        /* print some statistics */
-        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
-        print_prob(fplog, "pr", re->nrepl, prob);
-        fprintf(fplog, "\n");
-        re->nattempt[m]++;
-    }
-
-    /* record which moves were made and accepted */
-    for (i = 0; i < re->nrepl; i++)
-    {
-        re->nmoves[re->ind[i]][pind[i]] += 1;
-        re->nmoves[pind[i]][re->ind[i]] += 1;
-    }
-    fflush(fplog); /* make sure we can see what the last exchange was */
-}
-
-static void cyclic_decomposition(const int* destinations, int** cyclic, gmx_bool* incycle, const int nrepl, int* nswap)
-{
-
-    int i, j, c, p;
-    int maxlen = 1;
-    for (i = 0; i < nrepl; i++)
-    {
-        incycle[i] = FALSE;
-    }
-    for (i = 0; i < nrepl; i++) /* one cycle for each replica */
-    {
-        if (incycle[i])
-        {
-            cyclic[i][0] = -1;
-            continue;
-        }
-        cyclic[i][0] = i;
-        incycle[i]   = TRUE;
-        c            = 1;
-        p            = i;
-        for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */
-        {
-            p = destinations[p]; /* start permuting */
-            if (p == i)
-            {
-                cyclic[i][c] = -1;
-                if (c > maxlen)
-                {
-                    maxlen = c;
-                }
-                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
-            }
-            else
-            {
-                cyclic[i][c] = p; /* each permutation gives a new member of the cycle */
-                incycle[p]   = TRUE;
-                c++;
-            }
-        }
-    }
-    *nswap = maxlen - 1;
-
-    if (debug)
-    {
-        for (i = 0; i < nrepl; i++)
-        {
-            fprintf(debug, "Cycle %d:", i);
-            for (j = 0; j < nrepl; j++)
-            {
-                if (cyclic[i][j] < 0)
-                {
-                    break;
-                }
-                fprintf(debug, "%2d", cyclic[i][j]);
-            }
-            fprintf(debug, "\n");
-        }
-        fflush(debug);
-    }
-}
-
-static void compute_exchange_order(int** cyclic, int** order, const int nrepl, const int maxswap)
-{
-    int i, j;
-
-    for (j = 0; j < maxswap; j++)
-    {
-        for (i = 0; i < nrepl; i++)
-        {
-            if (cyclic[i][j + 1] >= 0)
-            {
-                order[cyclic[i][j + 1]][j] = cyclic[i][j];
-                order[cyclic[i][j]][j]     = cyclic[i][j + 1];
-            }
-        }
-        for (i = 0; i < nrepl; i++)
-        {
-            if (order[i][j] < 0)
-            {
-                order[i][j] = i; /* if it's not exchanging, it should stay this round*/
-            }
-        }
-    }
-
-    if (debug)
-    {
-        fprintf(debug, "Replica Exchange Order\n");
-        for (i = 0; i < nrepl; i++)
-        {
-            fprintf(debug, "Replica %d:", i);
-            for (j = 0; j < maxswap; j++)
-            {
-                if (order[i][j] < 0)
-                {
-                    break;
-                }
-                fprintf(debug, "%2d", order[i][j]);
-            }
-            fprintf(debug, "\n");
-        }
-        fflush(debug);
-    }
-}
-
-static void prepare_to_do_exchange(struct gmx_repl_ex* re, const int replica_id, int* maxswap, gmx_bool* bThisReplicaExchanged)
-{
-    int i, j;
-    /* Hold the cyclic decomposition of the (multiple) replica
-     * exchange. */
-    gmx_bool bAnyReplicaExchanged = FALSE;
-    *bThisReplicaExchanged        = FALSE;
-
-    for (i = 0; i < re->nrepl; i++)
-    {
-        if (re->destinations[i] != re->ind[i])
-        {
-            /* only mark as exchanged if the index has been shuffled */
-            bAnyReplicaExchanged = TRUE;
-            break;
-        }
-    }
-    if (bAnyReplicaExchanged)
-    {
-        /* reinitialize the placeholder arrays */
-        for (i = 0; i < re->nrepl; i++)
-        {
-            for (j = 0; j < re->nrepl; j++)
-            {
-                re->cyclic[i][j] = -1;
-                re->order[i][j]  = -1;
-            }
-        }
-
-        /* Identify the cyclic decomposition of the permutation (very
-         * fast if neighbor replica exchange). */
-        cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap);
-
-        /* Now translate the decomposition into a replica exchange
-         * order at each step. */
-        compute_exchange_order(re->cyclic, re->order, re->nrepl, *maxswap);
-
-        /* Did this replica do any exchange at any point? */
-        for (j = 0; j < *maxswap; j++)
-        {
-            if (replica_id != re->order[replica_id][j])
-            {
-                *bThisReplicaExchanged = TRUE;
-                break;
-            }
-        }
-    }
-}
-
-gmx_bool replica_exchange(FILE*                 fplog,
-                          const t_commrec*      cr,
-                          const gmx_multisim_t* ms,
-                          struct gmx_repl_ex*   re,
-                          t_state*              state,
-                          const gmx_enerdata_t* enerd,
-                          t_state*              state_local,
-                          int64_t               step,
-                          real                  time)
-{
-    int j;
-    int replica_id = 0;
-    int exchange_partner;
-    int maxswap = 0;
-    /* Number of rounds of exchanges needed to deal with any multiple
-     * exchanges. */
-    /* Where each replica ends up after the exchange attempt(s). */
-    /* The order in which multiple exchanges will occur. */
-    gmx_bool bThisReplicaExchanged = FALSE;
-
-    if (MASTER(cr))
-    {
-        replica_id = re->repl;
-        test_for_replica_exchange(fplog, ms, re, enerd, det(state_local->box), step, time);
-        prepare_to_do_exchange(re, replica_id, &maxswap, &bThisReplicaExchanged);
-    }
-    /* Do intra-simulation broadcast so all processors belonging to
-     * each simulation know whether they need to participate in
-     * collecting the state. Otherwise, they might as well get on with
-     * the next thing to do. */
-    if (DOMAINDECOMP(cr))
-    {
-#if GMX_MPI
-        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), cr->mpi_comm_mygroup);
-#endif
-    }
-
-    if (bThisReplicaExchanged)
-    {
-        /* Exchange the states */
-        /* Collect the global state on the master node */
-        if (DOMAINDECOMP(cr))
-        {
-            dd_collect_state(cr->dd, state_local, state);
-        }
-        else
-        {
-            copy_state_serial(state_local, state);
-        }
-
-        if (MASTER(cr))
-        {
-            /* There will be only one swap cycle with standard replica
-             * exchange, but there may be multiple swap cycles if we
-             * allow multiple swaps. */
-
-            for (j = 0; j < maxswap; j++)
-            {
-                exchange_partner = re->order[replica_id][j];
-
-                if (exchange_partner != replica_id)
-                {
-                    /* Exchange the global states between the master nodes */
-                    if (debug)
-                    {
-                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
-                    }
-                    exchange_state(ms, exchange_partner, state);
-                }
-            }
-            /* For temperature-type replica exchange, we need to scale
-             * the velocities. */
-            if (re->type == ereTEMP || re->type == ereTL)
-            {
-                scale_velocities(state->v, std::sqrt(re->q[ereTEMP][replica_id]
-                                                     / re->q[ereTEMP][re->destinations[replica_id]]));
-            }
-        }
-
-        /* With domain decomposition the global state is distributed later */
-        if (!DOMAINDECOMP(cr))
-        {
-            /* Copy the global state to the local state data structure */
-            copy_state_serial(state, state_local);
-        }
-    }
-
-    return bThisReplicaExchanged;
-}
-
-void print_replica_exchange_statistics(FILE* fplog, struct gmx_repl_ex* re)
-{
-    int i;
-
-    fprintf(fplog, "\nReplica exchange statistics\n");
-
-    if (re->nex == 0)
-    {
-        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n", re->nattempt[0] + re->nattempt[1],
-                re->nattempt[1], re->nattempt[0]);
-
-        fprintf(fplog, "Repl  average probabilities:\n");
-        for (i = 1; i < re->nrepl; i++)
-        {
-            if (re->nattempt[i % 2] == 0)
-            {
-                re->prob[i] = 0;
-            }
-            else
-            {
-                re->prob[i] = re->prob_sum[i] / re->nattempt[i % 2];
-            }
-        }
-        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
-        print_prob(fplog, "", re->nrepl, re->prob);
-
-        fprintf(fplog, "Repl  number of exchanges:\n");
-        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
-        print_count(fplog, "", re->nrepl, re->nexchange);
-
-        fprintf(fplog, "Repl  average number of exchanges:\n");
-        for (i = 1; i < re->nrepl; i++)
-        {
-            if (re->nattempt[i % 2] == 0)
-            {
-                re->prob[i] = 0;
-            }
-            else
-            {
-                re->prob[i] = (static_cast<real>(re->nexchange[i])) / re->nattempt[i % 2];
-            }
-        }
-        print_ind(fplog, "", re->nrepl, re->ind, nullptr);
-        print_prob(fplog, "", re->nrepl, re->prob);
-
-        fprintf(fplog, "\n");
-    }
-    /* print the transition matrix */
-    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
-}
-
-//! \endcond
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.h b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.h
deleted file mode 100644
index 108632d94d..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014,2015 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- *
- * \brief Declares the routines for replica exchange.
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- *
- * \ingroup module_mdrun
- */
-#ifndef GMX_MDRUN_REPLICAEXCHANGE_H
-#define GMX_MDRUN_REPLICAEXCHANGE_H
-
-#include <cstdio>
-
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/real.h"
-
-struct gmx_enerdata_t;
-struct gmx_multisim_t;
-struct t_commrec;
-struct t_inputrec;
-class t_state;
-
-/*! \libinternal
- * \brief The parameters for the replica exchange algorithm. */
-struct ReplicaExchangeParameters
-{
-    //! Interval in steps at which to attempt exchanges, 0 means no replica exchange.
-    int exchangeInterval = 0;
-    //! The number of exchanges to attempt at an exchange step.
-    int numExchanges = 0;
-    //! The random seed, -1 means generate a seed.
-    int randomSeed = -1;
-};
-
-//! Abstract type for replica exchange
-typedef struct gmx_repl_ex* gmx_repl_ex_t;
-
-/*! \brief Setup function.
- *
- * Should only be called on the master ranks */
-gmx_repl_ex_t init_replica_exchange(FILE*                            fplog,
-                                    const gmx_multisim_t*            ms,
-                                    int                              numAtomsInSystem,
-                                    const t_inputrec*                ir,
-                                    const ReplicaExchangeParameters& replExParams);
-
-/*! \brief Attempts replica exchange.
- *
- * Should be called on all ranks.  When running each replica in
- * parallel, this routine collects the state on the master rank before
- * exchange.  With domain decomposition, the global state after
- * exchange is stored in state and still needs to be redistributed
- * over the ranks.
- *
- * \returns TRUE if the state has been exchanged.
- */
-gmx_bool replica_exchange(FILE*                 fplog,
-                          const t_commrec*      cr,
-                          const gmx_multisim_t* ms,
-                          gmx_repl_ex_t         re,
-                          t_state*              state,
-                          const gmx_enerdata_t* enerd,
-                          t_state*              state_local,
-                          int64_t               step,
-                          real                  time);
-
-/*! \brief Prints replica exchange statistics to the log file.
- *
- * Should only be called on the master ranks */
-void print_replica_exchange_statistics(FILE* fplog, gmx_repl_ex_t re);
-
-/* PLUMED HREX */
-extern int replica_exchange_get_repl(const gmx_repl_ex_t re);
-extern int replica_exchange_get_nrepl(const gmx_repl_ex_t re);
-extern void pd_collect_state(const t_commrec *cr, t_state *state);
-extern void exchange_state(const gmx_multisim_t *ms, int b, t_state *state);
-extern void copy_state_serial(const t_state *src, t_state *dest);
-/* END PLUMED HREX */
-
-#endif
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.h.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.h.preplumed
deleted file mode 100644
index e8cb9bdce3..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/replicaexchange.h.preplumed
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011,2012,2013,2014,2015 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- *
- * \brief Declares the routines for replica exchange.
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \author Mark Abraham <mark.j.abraham@gmail.com>
- *
- * \ingroup module_mdrun
- */
-#ifndef GMX_MDRUN_REPLICAEXCHANGE_H
-#define GMX_MDRUN_REPLICAEXCHANGE_H
-
-#include <cstdio>
-
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/real.h"
-
-struct gmx_enerdata_t;
-struct gmx_multisim_t;
-struct t_commrec;
-struct t_inputrec;
-class t_state;
-
-/*! \libinternal
- * \brief The parameters for the replica exchange algorithm. */
-struct ReplicaExchangeParameters
-{
-    //! Interval in steps at which to attempt exchanges, 0 means no replica exchange.
-    int exchangeInterval = 0;
-    //! The number of exchanges to attempt at an exchange step.
-    int numExchanges = 0;
-    //! The random seed, -1 means generate a seed.
-    int randomSeed = -1;
-};
-
-//! Abstract type for replica exchange
-typedef struct gmx_repl_ex* gmx_repl_ex_t;
-
-/*! \brief Setup function.
- *
- * Should only be called on the master ranks */
-gmx_repl_ex_t init_replica_exchange(FILE*                            fplog,
-                                    const gmx_multisim_t*            ms,
-                                    int                              numAtomsInSystem,
-                                    const t_inputrec*                ir,
-                                    const ReplicaExchangeParameters& replExParams);
-
-/*! \brief Attempts replica exchange.
- *
- * Should be called on all ranks.  When running each replica in
- * parallel, this routine collects the state on the master rank before
- * exchange.  With domain decomposition, the global state after
- * exchange is stored in state and still needs to be redistributed
- * over the ranks.
- *
- * \returns TRUE if the state has been exchanged.
- */
-gmx_bool replica_exchange(FILE*                 fplog,
-                          const t_commrec*      cr,
-                          const gmx_multisim_t* ms,
-                          gmx_repl_ex_t         re,
-                          t_state*              state,
-                          const gmx_enerdata_t* enerd,
-                          t_state*              state_local,
-                          int64_t               step,
-                          real                  time);
-
-/*! \brief Prints replica exchange statistics to the log file.
- *
- * Should only be called on the master ranks */
-void print_replica_exchange_statistics(FILE* fplog, gmx_repl_ex_t re);
-
-#endif
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/rerun.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/rerun.cpp
deleted file mode 100644
index e1de8813ef..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/rerun.cpp
+++ /dev/null
@@ -1,805 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements the loop for simulation reruns
- *
- * \author Pascal Merz <pascal.merz@colorado.edu>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-
-#include <algorithm>
-#include <memory>
-
-#include "gromacs/applied_forces/awh/awh.h"
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/collect.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_network.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/mdsetup.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/ewald/pme_load_balancing.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/fileio/trxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/math/units.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/utilities.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/math/vectypes.h"
-#include "gromacs/mdlib/checkpointhandler.h"
-#include "gromacs/mdlib/compute_io.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/ebin.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/energyoutput.h"
-#include "gromacs/mdlib/expanded.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/freeenergyparameters.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/mdoutf.h"
-#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/resethandler.h"
-#include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/simulationsignal.h"
-#include "gromacs/mdlib/stat.h"
-#include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/trajectory_writing.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/vcm.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdtypes/awh_history.h"
-#include "gromacs/mdtypes/awh_params.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/df_history.h"
-#include "gromacs/mdtypes/energyhistory.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/observableshistory.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mimic/utilities.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/output.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/swap/swapcoords.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/atoms.h"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/trajectory/trajectoryframe.h"
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/real.h"
-
-#include "legacysimulator.h"
-#include "replicaexchange.h"
-#include "shellfc.h"
-
-/* PLUMED */
-#include "../../../Plumed.h"
-extern int    plumedswitch;
-extern plumed plumedmain;
-/* END PLUMED */
-
-using gmx::SimulationSignaller;
-using gmx::VirtualSitesHandler;
-
-/*! \brief Copy the state from \p rerunFrame to \p globalState and, if requested, construct vsites
- *
- * \param[in]     rerunFrame      The trajectory frame to compute energy/forces for
- * \param[in,out] globalState     The global state container
- * \param[in]     constructVsites When true, vsite coordinates are constructed
- * \param[in]     vsite           Vsite setup, can be nullptr when \p constructVsites = false
- * \param[in]     timeStep        Time step, used for constructing vsites
- */
-static void prepareRerunState(const t_trxframe&          rerunFrame,
-                              t_state*                   globalState,
-                              bool                       constructVsites,
-                              const VirtualSitesHandler* vsite,
-                              double                     timeStep)
-{
-    auto x      = makeArrayRef(globalState->x);
-    auto rerunX = arrayRefFromArray(reinterpret_cast<gmx::RVec*>(rerunFrame.x), globalState->natoms);
-    std::copy(rerunX.begin(), rerunX.end(), x.begin());
-    copy_mat(rerunFrame.box, globalState->box);
-
-    if (constructVsites)
-    {
-        GMX_ASSERT(vsite, "Need valid vsite for constructing vsites");
-
-        vsite->construct(globalState->x, timeStep, globalState->v, globalState->box);
-    }
-}
-
-void gmx::LegacySimulator::do_rerun()
-{
-    // TODO Historically, the EM and MD "integrators" used different
-    // names for the t_inputrec *parameter, but these must have the
-    // same name, now that it's a member of a struct. We use this ir
-    // alias to avoid a large ripple of nearly useless changes.
-    // t_inputrec is being replaced by IMdpOptionsProvider, so this
-    // will go away eventually.
-    t_inputrec*       ir = inputrec;
-    int64_t           step, step_rel;
-    double            t;
-    bool              isLastStep               = false;
-    bool              doFreeEnergyPerturbation = false;
-    unsigned int      force_flags;
-    tensor            force_vir, shake_vir, total_vir, pres;
-    t_trxstatus*      status = nullptr;
-    rvec              mu_tot;
-    t_trxframe        rerun_fr;
-    gmx_localtop_t    top(top_global->ffparams);
-    ForceBuffers      f;
-    gmx_global_stat_t gstat;
-    gmx_shellfc_t*    shellfc;
-
-    double cycles;
-
-    /* PLUMED */
-    int plumedNeedsEnergy=0;
-    int plumedWantsToStop=0;
-    matrix plumed_vir;
-    real lambdaForce=0;
-    real realFepState=0;
-    /* END PLUMED */
-
-    /* Domain decomposition could incorrectly miss a bonded
-       interaction, but checking for that requires a global
-       communication stage, which does not otherwise happen in DD
-       code. So we do that alongside the first global energy reduction
-       after a new DD is made. These variables handle whether the
-       check happens, and the result it returns. */
-    bool shouldCheckNumberOfBondedInteractions = false;
-    int  totalNumberOfBondedInteractions       = -1;
-
-    SimulationSignals signals;
-    // Most global communnication stages don't propagate mdrun
-    // signals, and will use this object to achieve that.
-    SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that it is planned that the command gmx mdrun -rerun will "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx rerun -f.");
-
-    if (ir->efep != efepNO
-        && (mdAtoms->mdatoms()->nMassPerturbed > 0 || (constr && constr->havePerturbedConstraints())))
-    {
-        gmx_fatal(FARGS,
-                  "Perturbed masses or constraints are not supported by rerun. "
-                  "Either make a .tpr without mass and constraint perturbation, "
-                  "or use GROMACS 2018.4, 2018.5 or later 2018 version.");
-    }
-    if (ir->bExpanded)
-    {
-        gmx_fatal(FARGS, "Expanded ensemble not supported by rerun.");
-    }
-    if (ir->bSimTemp)
-    {
-        gmx_fatal(FARGS, "Simulated tempering not supported by rerun.");
-    }
-    if (ir->bDoAwh)
-    {
-        gmx_fatal(FARGS, "AWH not supported by rerun.");
-    }
-    if (replExParams.exchangeInterval > 0)
-    {
-        gmx_fatal(FARGS, "Replica exchange not supported by rerun.");
-    }
-    if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr)
-    {
-        gmx_fatal(FARGS, "Essential dynamics not supported by rerun.");
-    }
-    if (ir->bIMD)
-    {
-        gmx_fatal(FARGS, "Interactive MD not supported by rerun.");
-    }
-    if (isMultiSim(ms))
-    {
-        gmx_fatal(FARGS, "Multiple simulations not supported by rerun.");
-    }
-    if (std::any_of(ir->opts.annealing, ir->opts.annealing + ir->opts.ngtc,
-                    [](int i) { return i != eannNO; }))
-    {
-        gmx_fatal(FARGS, "Simulated annealing not supported by rerun.");
-    }
-
-    /* Rerun can't work if an output file name is the same as the input file name.
-     * If this is the case, the user will get an error telling them what the issue is.
-     */
-    if (strcmp(opt2fn("-rerun", nfile, fnm), opt2fn("-o", nfile, fnm)) == 0
-        || strcmp(opt2fn("-rerun", nfile, fnm), opt2fn("-x", nfile, fnm)) == 0)
-    {
-        gmx_fatal(FARGS,
-                  "When using mdrun -rerun, the name of the input trajectory file "
-                  "%s cannot be identical to the name of an output file (whether "
-                  "given explicitly with -o or -x, or by default)",
-                  opt2fn("-rerun", nfile, fnm));
-    }
-
-    /* Settings for rerun */
-    ir->nstlist              = 1;
-    ir->nstcalcenergy        = 1;
-    int        nstglobalcomm = 1;
-    const bool bNS           = true;
-
-    ir->nstxout_compressed         = 0;
-    const SimulationGroups* groups = &top_global->groups;
-    if (ir->eI == eiMimic)
-    {
-        auto nonConstGlobalTopology                          = const_cast<gmx_mtop_t*>(top_global);
-        nonConstGlobalTopology->intermolecularExclusionGroup = genQmmmIndices(*top_global);
-    }
-    int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
-    gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
-    initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, ir, top_global, oenv, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
-                                   mdoutf_get_fp_dhdl(outf), true, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    gstat = global_stat_init(ir);
-
-    /* Check for polarizable models and flexible constraints */
-    shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
-                                 ir->nstcalcenergy, DOMAINDECOMP(cr),
-                                 runScheduleWork->simulationWork.useGpuPme);
-
-    {
-        double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
-        if ((io > 2000) && MASTER(cr))
-        {
-            fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
-        }
-    }
-
-    // Local state only becomes valid now.
-    std::unique_ptr<t_state> stateInstance;
-    t_state*                 state;
-
-    if (DOMAINDECOMP(cr))
-    {
-        stateInstance = std::make_unique<t_state>();
-        state         = stateInstance.get();
-        dd_init_local_state(cr->dd, state_global, state);
-
-        /* Distribute the charge groups over the nodes from the master node */
-        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
-                            imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
-                            nrnb, nullptr, FALSE);
-        shouldCheckNumberOfBondedInteractions = true;
-    }
-    else
-    {
-        state_change_natoms(state_global, state_global->natoms);
-        /* Copy the pointer to the global state */
-        state = state_global;
-
-        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
-    }
-
-    auto mdatoms = mdAtoms->mdatoms();
-
-    // NOTE: The global state is no longer used at this point.
-    // But state_global is still used as temporary storage space for writing
-    // the global state to file and potentially for replica exchange.
-    // (Global topology should persist.)
-
-    update_mdatoms(mdatoms, state->lambda[efptMASS]);
-
-    if (ir->efep != efepNO && ir->fepvals->nstdhdl != 0)
-    {
-        doFreeEnergyPerturbation = true;
-    }
-
-    {
-        int cglo_flags =
-                (CGLO_GSTAT
-                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
-        bool   bSumEkinhOld = false;
-        t_vcm* vcm          = nullptr;
-        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v), state->box, mdatoms, nrnb, vcm, nullptr, enerd,
-                        force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, state->box,
-                        &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags);
-    }
-    checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
-                                    makeConstArrayRef(state->x), state->box,
-                                    &shouldCheckNumberOfBondedInteractions);
-
-    if (MASTER(cr))
-    {
-        fprintf(stderr,
-                "starting md rerun '%s', reading coordinates from"
-                " input trajectory '%s'\n\n",
-                *(top_global->name), opt2fn("-rerun", nfile, fnm));
-        if (mdrunOptions.verbose)
-        {
-            fprintf(stderr,
-                    "Calculated time to finish depends on nsteps from "
-                    "run input file,\nwhich may not correspond to the time "
-                    "needed to process input trajectory.\n\n");
-        }
-        fprintf(fplog, "\n");
-    }
-
-    /* PLUMED */
-    if(plumedswitch){
-      /* detect plumed API version */
-      int pversion=0;
-      plumed_cmd(plumedmain,"getApiVersion",&pversion);
-      /* setting kbT is only implemented with api>1) */
-      real kbT=ir->opts.ref_t[0]*BOLTZ;
-      if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
-      if(pversion>2){
-        int res=1;
-        if( (startingBehavior != StartingBehavior::NewSimulation) ) plumed_cmd(plumedmain,"setRestart",&res);
-      }
-
-      if(PAR(cr)){
-        if(DOMAINDECOMP(cr)) {
-          plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
-        }
-      }
-      plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
-      plumed_cmd(plumedmain,"setMDEngine","gromacs");
-      plumed_cmd(plumedmain,"setLog",fplog);
-      real real_delta_t=ir->delta_t;
-      plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
-      plumed_cmd(plumedmain,"init",nullptr);
-
-      if(PAR(cr)){
-        if(DOMAINDECOMP(cr)) {
-          int nat_home = dd_numHomeAtoms(*cr->dd);
-          plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
-          plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
-        }
-      }
-      realFepState = state->fep_state;
-      plumed_cmd(plumedmain, "setExtraCV lambda", &realFepState);
-      plumed_cmd(plumedmain, "setExtraCVForce lambda", &lambdaForce);
-    }
-    /* END PLUMED */
-
-    walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
-    print_start(fplog, cr, walltime_accounting, "mdrun");
-
-    /***********************************************************
-     *
-     *             Loop over MD steps
-     *
-     ************************************************************/
-
-    if (constr)
-    {
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText("Simulations has constraints. Rerun does not recalculate constraints.");
-    }
-
-    rerun_fr.natoms = 0;
-    if (MASTER(cr))
-    {
-        isLastStep = !read_first_frame(oenv, &status, opt2fn("-rerun", nfile, fnm), &rerun_fr, TRX_NEED_X);
-        if (rerun_fr.natoms != top_global->natoms)
-        {
-            gmx_fatal(FARGS,
-                      "Number of atoms in trajectory (%d) does not match the "
-                      "run input file (%d)\n",
-                      rerun_fr.natoms, top_global->natoms);
-        }
-
-        if (ir->pbcType != PbcType::No)
-        {
-            if (!rerun_fr.bBox)
-            {
-                gmx_fatal(FARGS,
-                          "Rerun trajectory frame step %" PRId64
-                          " time %f "
-                          "does not contain a box, while pbc is used",
-                          rerun_fr.step, rerun_fr.time);
-            }
-            if (max_cutoff2(ir->pbcType, rerun_fr.box) < gmx::square(fr->rlist))
-            {
-                gmx_fatal(FARGS,
-                          "Rerun trajectory frame step %" PRId64
-                          " time %f "
-                          "has too small box dimensions",
-                          rerun_fr.step, rerun_fr.time);
-            }
-        }
-    }
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Rerun does not report kinetic energy, total energy, temperature, virial and "
-                    "pressure.");
-
-    if (PAR(cr))
-    {
-        rerun_parallel_comm(cr, &rerun_fr, &isLastStep);
-    }
-
-    if (ir->pbcType != PbcType::No)
-    {
-        /* Set the shift vectors.
-         * Necessary here when have a static box different from the tpr box.
-         */
-        calc_shifts(rerun_fr.box, fr->shift_vec);
-    }
-
-    step     = ir->init_step;
-    step_rel = 0;
-
-    auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
-            compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), false, MASTER(cr),
-            ir->nstlist, mdrunOptions.reproducible, nstglobalcomm, mdrunOptions.maximumHoursToRun,
-            ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
-
-    // we don't do counter resetting in rerun - finish will always be valid
-    walltime_accounting_set_valid_finish(walltime_accounting);
-
-    const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
-
-    /* and stop now if we should */
-    isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
-    while (!isLastStep)
-    {
-        wallcycle_start(wcycle, ewcSTEP);
-
-        if (rerun_fr.bStep)
-        {
-            step     = rerun_fr.step;
-            step_rel = step - ir->init_step;
-        }
-        if (rerun_fr.bTime)
-        {
-            t = rerun_fr.time;
-        }
-        else
-        {
-            t = step;
-        }
-
-        if (ir->efep != efepNO && MASTER(cr))
-        {
-            if (rerun_fr.bLambda)
-            {
-                ir->fepvals->init_lambda = rerun_fr.lambda;
-            }
-            else
-            {
-                if (rerun_fr.bFepState)
-                {
-                    state->fep_state = rerun_fr.fep_state;
-                }
-            }
-
-            state_global->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
-        }
-
-        if (MASTER(cr))
-        {
-            const bool constructVsites = ((vsite != nullptr) && mdrunOptions.rerunConstructVsites);
-            if (constructVsites && DOMAINDECOMP(cr))
-            {
-                gmx_fatal(FARGS,
-                          "Vsite recalculation with -rerun is not implemented with domain "
-                          "decomposition, "
-                          "use a single rank");
-            }
-            prepareRerunState(rerun_fr, state_global, constructVsites, vsite, ir->delta_t);
-        }
-
-        isLastStep = isLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
-
-        if (DOMAINDECOMP(cr))
-        {
-            /* Repartition the domain decomposition */
-            const bool bMasterState = true;
-            dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
-                                *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
-                                fr, vsite, constr, nrnb, wcycle, mdrunOptions.verbose);
-            shouldCheckNumberOfBondedInteractions = true;
-            /* PLUMED */
-            if(plumedswitch){
-              int nat_home = dd_numHomeAtoms(*cr->dd);
-              plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
-              plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
-            }
-            /* END PLUMED */
-        }
-
-        if (MASTER(cr))
-        {
-            EnergyOutput::printHeader(fplog, step, t); /* can we improve the information printed here? */
-        }
-
-        if (ir->efep != efepNO)
-        {
-            update_mdatoms(mdatoms, state->lambda[efptMASS]);
-        }
-
-        force_flags = (GMX_FORCE_STATECHANGED | GMX_FORCE_DYNAMICBOX | GMX_FORCE_ALLFORCES
-                       | GMX_FORCE_VIRIAL | // TODO: Get rid of this once #2649 and #3400 are solved
-                       GMX_FORCE_ENERGY | (doFreeEnergyPerturbation ? GMX_FORCE_DHDL : 0));
-
-        if (shellfc)
-        {
-            /* Now is the time to relax the shells */
-            relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
-                                imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
-                                state->natoms, state->x.arrayRefWithPadding(),
-                                state->v.arrayRefWithPadding(), state->box, state->lambda,
-                                &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
-                                fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
-        }
-        else
-        {
-            /* The coordinates (x) are shifted (to get whole molecules)
-             * in do_force.
-             * This is parallellized as well, and does communication too.
-             * Check comments in sim_util.c
-             */
-            Awh*       awh = nullptr;
-            gmx_edsam* ed  = nullptr;
-            /* PLUMED */
-            plumedNeedsEnergy=0;
-            if(plumedswitch){
-              int pversion=0;
-              plumed_cmd(plumedmain,"getApiVersion",&pversion);
-              long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
-              plumed_cmd(plumedmain,"setPositions",&state->x[0][0]);
-              plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]);
-              plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]);
-              plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
-              plumed_cmd(plumedmain,"prepareCalc",nullptr);
-              plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop);
-              plumed_cmd(plumedmain,"setForces",&f.view().force()[0][0]);
-              plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
-              if(plumedNeedsEnergy) force_flags |= GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL;
-              clear_mat(plumed_vir);
-              plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]);
-            }
-            /* END PLUMED */
-            do_force(fplog, cr, ms, ir, awh, enforcedRotation, imdSession, pull_work, step, nrnb,
-                     wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
-                     &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
-                     vsite, mu_tot, t, ed, GMX_FORCE_NS | force_flags, ddBalanceRegionHandler);
-            /* PLUMED */
-            if(plumedswitch){
-              if(plumedNeedsEnergy){
-                msmul(force_vir,2.0,plumed_vir);
-                plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
-                plumed_cmd(plumedmain,"performCalc",nullptr);
-                msmul(plumed_vir,0.5,force_vir);
-              } else {
-                msmul(plumed_vir,0.5,plumed_vir);
-                m_add(force_vir,plumed_vir,force_vir);
-              }
-              if(plumedWantsToStop) isLastStep = true;
-            }
-            /* END PLUMED */
-        }
-
-        /* Now we have the energies and forces corresponding to the
-         * coordinates at time t.
-         */
-        {
-            const bool isCheckpointingStep = false;
-            const bool doRerun             = true;
-            const bool bSumEkinhOld        = false;
-            do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state,
-                                     state_global, observablesHistory, top_global, fr, outf,
-                                     energyOutput, ekind, f.view().force(), isCheckpointingStep,
-                                     doRerun, isLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
-        }
-
-        stopHandler->setSignal();
-
-        if (vsite != nullptr)
-        {
-            wallcycle_start(wcycle, ewcVSITECONSTR);
-            vsite->construct(state->x, ir->delta_t, state->v, state->box);
-            wallcycle_stop(wcycle, ewcVSITECONSTR);
-        }
-
-        {
-            const bool          doInterSimSignal = false;
-            const bool          doIntraSimSignal = true;
-            bool                bSumEkinhOld     = false;
-            t_vcm*              vcm              = nullptr;
-            SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
-
-            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                            makeConstArrayRef(state->v), state->box, mdatoms, nrnb, vcm, wcycle,
-                            enerd, force_vir, shake_vir, total_vir, pres, constr, &signaller,
-                            state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                            CGLO_GSTAT | CGLO_ENERGY
-                                    | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                             : 0));
-            checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
-                                            &top, makeConstArrayRef(state->x), state->box,
-                                            &shouldCheckNumberOfBondedInteractions);
-        }
-
-        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-           the virial that should probably be addressed eventually. state->veta has better properies,
-           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
-
-        /* Output stuff */
-        if (MASTER(cr))
-        {
-            const bool bCalcEnerStep = true;
-            energyOutput.addDataAtEnergyStep(
-                    doFreeEnergyPerturbation, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
-                    ir->expandedvals, state->box,
-                    PTCouplingArrays({ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
-                                       state->nhpres_xi, state->nhpres_vxi }),
-                    state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
-
-            const bool do_ene = true;
-            const bool do_log = true;
-            Awh*       awh    = nullptr;
-            const bool do_dr  = ir->nstdisreout != 0;
-            const bool do_or  = ir->nstorireout != 0;
-
-            EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
-                                               do_log ? fplog : nullptr, step, t, fr->fcdata.get(), awh);
-
-            if (ir->bPull)
-            {
-                pull_print_output(pull_work, step, t);
-            }
-
-            if (do_per_step(step, ir->nstlog))
-            {
-                if (fflush(fplog) != 0)
-                {
-                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
-                }
-            }
-        }
-
-        /* Print the remaining wall clock time for the run */
-        if (isMasterSimMasterRank(ms, MASTER(cr)) && (mdrunOptions.verbose || gmx_got_usr_signal()))
-        {
-            if (shellfc)
-            {
-                fprintf(stderr, "\n");
-            }
-            print_time(stderr, walltime_accounting, step, ir, cr);
-        }
-
-        /* Ion/water position swapping.
-         * Not done in last step since trajectory writing happens before this call
-         * in the MD loop and exchanges would be lost anyway. */
-        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !isLastStep && do_per_step(step, ir->swap->nstswap))
-        {
-            const bool doRerun = true;
-            do_swapcoords(cr, step, t, ir, swap, wcycle, rerun_fr.x, rerun_fr.box,
-                          MASTER(cr) && mdrunOptions.verbose, doRerun);
-        }
-
-        if (MASTER(cr))
-        {
-            /* read next frame from input trajectory */
-            isLastStep = !read_next_frame(oenv, status, &rerun_fr);
-        }
-
-        if (PAR(cr))
-        {
-            rerun_parallel_comm(cr, &rerun_fr, &isLastStep);
-        }
-
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
-        if (DOMAINDECOMP(cr) && wcycle)
-        {
-            dd_cycles_add(cr->dd, cycles, ddCyclStep);
-        }
-
-        if (!rerun_fr.bStep)
-        {
-            /* increase the MD step number */
-            step++;
-            step_rel++;
-        }
-    }
-    /* End of main MD loop */
-
-    /* Closing TNG files can include compressing data. Therefore it is good to do that
-     * before stopping the time measurements. */
-    mdoutf_tng_close(outf);
-
-    /* Stop measuring walltime */
-    walltime_accounting_end_time(walltime_accounting);
-
-    if (MASTER(cr))
-    {
-        close_trx(status);
-    }
-
-    if (!thisRankHasDuty(cr, DUTY_PME))
-    {
-        /* Tell the PME only node to finish */
-        gmx_pme_send_finish(cr);
-    }
-
-    done_mdoutf(outf);
-
-    done_shellfc(fplog, shellfc, step_rel);
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/rerun.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/rerun.cpp.preplumed
deleted file mode 100644
index 36333d3c94..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/rerun.cpp.preplumed
+++ /dev/null
@@ -1,712 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements the loop for simulation reruns
- *
- * \author Pascal Merz <pascal.merz@colorado.edu>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-
-#include <algorithm>
-#include <memory>
-
-#include "gromacs/applied_forces/awh/awh.h"
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/collect.h"
-#include "gromacs/domdec/dlbtiming.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_network.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/mdsetup.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/ewald/pme_load_balancing.h"
-#include "gromacs/ewald/pme_pp.h"
-#include "gromacs/fileio/trxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/utilities.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/math/vectypes.h"
-#include "gromacs/mdlib/checkpointhandler.h"
-#include "gromacs/mdlib/compute_io.h"
-#include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/ebin.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/energyoutput.h"
-#include "gromacs/mdlib/expanded.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/freeenergyparameters.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/mdoutf.h"
-#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/resethandler.h"
-#include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/simulationsignal.h"
-#include "gromacs/mdlib/stat.h"
-#include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/trajectory_writing.h"
-#include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/vcm.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdtypes/awh_history.h"
-#include "gromacs/mdtypes/awh_params.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/df_history.h"
-#include "gromacs/mdtypes/energyhistory.h"
-#include "gromacs/mdtypes/forcebuffers.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/observableshistory.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mimic/utilities.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/output.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/swap/swapcoords.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
-#include "gromacs/topology/atoms.h"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/topology/topology.h"
-#include "gromacs/trajectory/trajectoryframe.h"
-#include "gromacs/utility/basedefinitions.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/real.h"
-
-#include "legacysimulator.h"
-#include "replicaexchange.h"
-#include "shellfc.h"
-
-using gmx::SimulationSignaller;
-using gmx::VirtualSitesHandler;
-
-/*! \brief Copy the state from \p rerunFrame to \p globalState and, if requested, construct vsites
- *
- * \param[in]     rerunFrame      The trajectory frame to compute energy/forces for
- * \param[in,out] globalState     The global state container
- * \param[in]     constructVsites When true, vsite coordinates are constructed
- * \param[in]     vsite           Vsite setup, can be nullptr when \p constructVsites = false
- * \param[in]     timeStep        Time step, used for constructing vsites
- */
-static void prepareRerunState(const t_trxframe&          rerunFrame,
-                              t_state*                   globalState,
-                              bool                       constructVsites,
-                              const VirtualSitesHandler* vsite,
-                              double                     timeStep)
-{
-    auto x      = makeArrayRef(globalState->x);
-    auto rerunX = arrayRefFromArray(reinterpret_cast<gmx::RVec*>(rerunFrame.x), globalState->natoms);
-    std::copy(rerunX.begin(), rerunX.end(), x.begin());
-    copy_mat(rerunFrame.box, globalState->box);
-
-    if (constructVsites)
-    {
-        GMX_ASSERT(vsite, "Need valid vsite for constructing vsites");
-
-        vsite->construct(globalState->x, timeStep, globalState->v, globalState->box);
-    }
-}
-
-void gmx::LegacySimulator::do_rerun()
-{
-    // TODO Historically, the EM and MD "integrators" used different
-    // names for the t_inputrec *parameter, but these must have the
-    // same name, now that it's a member of a struct. We use this ir
-    // alias to avoid a large ripple of nearly useless changes.
-    // t_inputrec is being replaced by IMdpOptionsProvider, so this
-    // will go away eventually.
-    t_inputrec*       ir = inputrec;
-    int64_t           step, step_rel;
-    double            t;
-    bool              isLastStep               = false;
-    bool              doFreeEnergyPerturbation = false;
-    unsigned int      force_flags;
-    tensor            force_vir, shake_vir, total_vir, pres;
-    t_trxstatus*      status = nullptr;
-    rvec              mu_tot;
-    t_trxframe        rerun_fr;
-    gmx_localtop_t    top(top_global->ffparams);
-    ForceBuffers      f;
-    gmx_global_stat_t gstat;
-    gmx_shellfc_t*    shellfc;
-
-    double cycles;
-
-    /* Domain decomposition could incorrectly miss a bonded
-       interaction, but checking for that requires a global
-       communication stage, which does not otherwise happen in DD
-       code. So we do that alongside the first global energy reduction
-       after a new DD is made. These variables handle whether the
-       check happens, and the result it returns. */
-    bool shouldCheckNumberOfBondedInteractions = false;
-    int  totalNumberOfBondedInteractions       = -1;
-
-    SimulationSignals signals;
-    // Most global communnication stages don't propagate mdrun
-    // signals, and will use this object to achieve that.
-    SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Note that it is planned that the command gmx mdrun -rerun will "
-                    "be available in a different form in a future version of GROMACS, "
-                    "e.g. gmx rerun -f.");
-
-    if (ir->efep != efepNO
-        && (mdAtoms->mdatoms()->nMassPerturbed > 0 || (constr && constr->havePerturbedConstraints())))
-    {
-        gmx_fatal(FARGS,
-                  "Perturbed masses or constraints are not supported by rerun. "
-                  "Either make a .tpr without mass and constraint perturbation, "
-                  "or use GROMACS 2018.4, 2018.5 or later 2018 version.");
-    }
-    if (ir->bExpanded)
-    {
-        gmx_fatal(FARGS, "Expanded ensemble not supported by rerun.");
-    }
-    if (ir->bSimTemp)
-    {
-        gmx_fatal(FARGS, "Simulated tempering not supported by rerun.");
-    }
-    if (ir->bDoAwh)
-    {
-        gmx_fatal(FARGS, "AWH not supported by rerun.");
-    }
-    if (replExParams.exchangeInterval > 0)
-    {
-        gmx_fatal(FARGS, "Replica exchange not supported by rerun.");
-    }
-    if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr)
-    {
-        gmx_fatal(FARGS, "Essential dynamics not supported by rerun.");
-    }
-    if (ir->bIMD)
-    {
-        gmx_fatal(FARGS, "Interactive MD not supported by rerun.");
-    }
-    if (isMultiSim(ms))
-    {
-        gmx_fatal(FARGS, "Multiple simulations not supported by rerun.");
-    }
-    if (std::any_of(ir->opts.annealing, ir->opts.annealing + ir->opts.ngtc,
-                    [](int i) { return i != eannNO; }))
-    {
-        gmx_fatal(FARGS, "Simulated annealing not supported by rerun.");
-    }
-
-    /* Rerun can't work if an output file name is the same as the input file name.
-     * If this is the case, the user will get an error telling them what the issue is.
-     */
-    if (strcmp(opt2fn("-rerun", nfile, fnm), opt2fn("-o", nfile, fnm)) == 0
-        || strcmp(opt2fn("-rerun", nfile, fnm), opt2fn("-x", nfile, fnm)) == 0)
-    {
-        gmx_fatal(FARGS,
-                  "When using mdrun -rerun, the name of the input trajectory file "
-                  "%s cannot be identical to the name of an output file (whether "
-                  "given explicitly with -o or -x, or by default)",
-                  opt2fn("-rerun", nfile, fnm));
-    }
-
-    /* Settings for rerun */
-    ir->nstlist              = 1;
-    ir->nstcalcenergy        = 1;
-    int        nstglobalcomm = 1;
-    const bool bNS           = true;
-
-    ir->nstxout_compressed         = 0;
-    const SimulationGroups* groups = &top_global->groups;
-    if (ir->eI == eiMimic)
-    {
-        auto nonConstGlobalTopology                          = const_cast<gmx_mtop_t*>(top_global);
-        nonConstGlobalTopology->intermolecularExclusionGroup = genQmmmIndices(*top_global);
-    }
-    int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
-    gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
-    initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
-    const bool        simulationsShareState = false;
-    gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
-                                   mdModulesNotifier, ir, top_global, oenv, wcycle,
-                                   StartingBehavior::NewSimulation, simulationsShareState, ms);
-    gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
-                                   mdoutf_get_fp_dhdl(outf), true, StartingBehavior::NewSimulation,
-                                   simulationsShareState, mdModulesNotifier);
-
-    gstat = global_stat_init(ir);
-
-    /* Check for polarizable models and flexible constraints */
-    shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
-                                 ir->nstcalcenergy, DOMAINDECOMP(cr),
-                                 runScheduleWork->simulationWork.useGpuPme);
-
-    {
-        double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
-        if ((io > 2000) && MASTER(cr))
-        {
-            fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
-        }
-    }
-
-    // Local state only becomes valid now.
-    std::unique_ptr<t_state> stateInstance;
-    t_state*                 state;
-
-    if (DOMAINDECOMP(cr))
-    {
-        stateInstance = std::make_unique<t_state>();
-        state         = stateInstance.get();
-        dd_init_local_state(cr->dd, state_global, state);
-
-        /* Distribute the charge groups over the nodes from the master node */
-        dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
-                            imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
-                            nrnb, nullptr, FALSE);
-        shouldCheckNumberOfBondedInteractions = true;
-    }
-    else
-    {
-        state_change_natoms(state_global, state_global->natoms);
-        /* Copy the pointer to the global state */
-        state = state_global;
-
-        mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
-    }
-
-    auto mdatoms = mdAtoms->mdatoms();
-
-    // NOTE: The global state is no longer used at this point.
-    // But state_global is still used as temporary storage space for writing
-    // the global state to file and potentially for replica exchange.
-    // (Global topology should persist.)
-
-    update_mdatoms(mdatoms, state->lambda[efptMASS]);
-
-    if (ir->efep != efepNO && ir->fepvals->nstdhdl != 0)
-    {
-        doFreeEnergyPerturbation = true;
-    }
-
-    {
-        int cglo_flags =
-                (CGLO_GSTAT
-                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
-        bool   bSumEkinhOld = false;
-        t_vcm* vcm          = nullptr;
-        compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v), state->box, mdatoms, nrnb, vcm, nullptr, enerd,
-                        force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, state->box,
-                        &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags);
-    }
-    checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
-                                    makeConstArrayRef(state->x), state->box,
-                                    &shouldCheckNumberOfBondedInteractions);
-
-    if (MASTER(cr))
-    {
-        fprintf(stderr,
-                "starting md rerun '%s', reading coordinates from"
-                " input trajectory '%s'\n\n",
-                *(top_global->name), opt2fn("-rerun", nfile, fnm));
-        if (mdrunOptions.verbose)
-        {
-            fprintf(stderr,
-                    "Calculated time to finish depends on nsteps from "
-                    "run input file,\nwhich may not correspond to the time "
-                    "needed to process input trajectory.\n\n");
-        }
-        fprintf(fplog, "\n");
-    }
-
-    walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
-    print_start(fplog, cr, walltime_accounting, "mdrun");
-
-    /***********************************************************
-     *
-     *             Loop over MD steps
-     *
-     ************************************************************/
-
-    if (constr)
-    {
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText("Simulations has constraints. Rerun does not recalculate constraints.");
-    }
-
-    rerun_fr.natoms = 0;
-    if (MASTER(cr))
-    {
-        isLastStep = !read_first_frame(oenv, &status, opt2fn("-rerun", nfile, fnm), &rerun_fr, TRX_NEED_X);
-        if (rerun_fr.natoms != top_global->natoms)
-        {
-            gmx_fatal(FARGS,
-                      "Number of atoms in trajectory (%d) does not match the "
-                      "run input file (%d)\n",
-                      rerun_fr.natoms, top_global->natoms);
-        }
-
-        if (ir->pbcType != PbcType::No)
-        {
-            if (!rerun_fr.bBox)
-            {
-                gmx_fatal(FARGS,
-                          "Rerun trajectory frame step %" PRId64
-                          " time %f "
-                          "does not contain a box, while pbc is used",
-                          rerun_fr.step, rerun_fr.time);
-            }
-            if (max_cutoff2(ir->pbcType, rerun_fr.box) < gmx::square(fr->rlist))
-            {
-                gmx_fatal(FARGS,
-                          "Rerun trajectory frame step %" PRId64
-                          " time %f "
-                          "has too small box dimensions",
-                          rerun_fr.step, rerun_fr.time);
-            }
-        }
-    }
-
-    GMX_LOG(mdlog.info)
-            .asParagraph()
-            .appendText(
-                    "Rerun does not report kinetic energy, total energy, temperature, virial and "
-                    "pressure.");
-
-    if (PAR(cr))
-    {
-        rerun_parallel_comm(cr, &rerun_fr, &isLastStep);
-    }
-
-    if (ir->pbcType != PbcType::No)
-    {
-        /* Set the shift vectors.
-         * Necessary here when have a static box different from the tpr box.
-         */
-        calc_shifts(rerun_fr.box, fr->shift_vec);
-    }
-
-    step     = ir->init_step;
-    step_rel = 0;
-
-    auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
-            compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), false, MASTER(cr),
-            ir->nstlist, mdrunOptions.reproducible, nstglobalcomm, mdrunOptions.maximumHoursToRun,
-            ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
-
-    // we don't do counter resetting in rerun - finish will always be valid
-    walltime_accounting_set_valid_finish(walltime_accounting);
-
-    const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
-
-    /* and stop now if we should */
-    isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
-    while (!isLastStep)
-    {
-        wallcycle_start(wcycle, ewcSTEP);
-
-        if (rerun_fr.bStep)
-        {
-            step     = rerun_fr.step;
-            step_rel = step - ir->init_step;
-        }
-        if (rerun_fr.bTime)
-        {
-            t = rerun_fr.time;
-        }
-        else
-        {
-            t = step;
-        }
-
-        if (ir->efep != efepNO && MASTER(cr))
-        {
-            if (rerun_fr.bLambda)
-            {
-                ir->fepvals->init_lambda = rerun_fr.lambda;
-            }
-            else
-            {
-                if (rerun_fr.bFepState)
-                {
-                    state->fep_state = rerun_fr.fep_state;
-                }
-            }
-
-            state_global->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
-        }
-
-        if (MASTER(cr))
-        {
-            const bool constructVsites = ((vsite != nullptr) && mdrunOptions.rerunConstructVsites);
-            if (constructVsites && DOMAINDECOMP(cr))
-            {
-                gmx_fatal(FARGS,
-                          "Vsite recalculation with -rerun is not implemented with domain "
-                          "decomposition, "
-                          "use a single rank");
-            }
-            prepareRerunState(rerun_fr, state_global, constructVsites, vsite, ir->delta_t);
-        }
-
-        isLastStep = isLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
-
-        if (DOMAINDECOMP(cr))
-        {
-            /* Repartition the domain decomposition */
-            const bool bMasterState = true;
-            dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
-                                *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
-                                fr, vsite, constr, nrnb, wcycle, mdrunOptions.verbose);
-            shouldCheckNumberOfBondedInteractions = true;
-        }
-
-        if (MASTER(cr))
-        {
-            EnergyOutput::printHeader(fplog, step, t); /* can we improve the information printed here? */
-        }
-
-        if (ir->efep != efepNO)
-        {
-            update_mdatoms(mdatoms, state->lambda[efptMASS]);
-        }
-
-        force_flags = (GMX_FORCE_STATECHANGED | GMX_FORCE_DYNAMICBOX | GMX_FORCE_ALLFORCES
-                       | GMX_FORCE_VIRIAL | // TODO: Get rid of this once #2649 and #3400 are solved
-                       GMX_FORCE_ENERGY | (doFreeEnergyPerturbation ? GMX_FORCE_DHDL : 0));
-
-        if (shellfc)
-        {
-            /* Now is the time to relax the shells */
-            relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
-                                imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
-                                state->natoms, state->x.arrayRefWithPadding(),
-                                state->v.arrayRefWithPadding(), state->box, state->lambda,
-                                &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
-                                fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
-        }
-        else
-        {
-            /* The coordinates (x) are shifted (to get whole molecules)
-             * in do_force.
-             * This is parallellized as well, and does communication too.
-             * Check comments in sim_util.c
-             */
-            Awh*       awh = nullptr;
-            gmx_edsam* ed  = nullptr;
-            do_force(fplog, cr, ms, ir, awh, enforcedRotation, imdSession, pull_work, step, nrnb,
-                     wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
-                     &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
-                     vsite, mu_tot, t, ed, GMX_FORCE_NS | force_flags, ddBalanceRegionHandler);
-        }
-
-        /* Now we have the energies and forces corresponding to the
-         * coordinates at time t.
-         */
-        {
-            const bool isCheckpointingStep = false;
-            const bool doRerun             = true;
-            const bool bSumEkinhOld        = false;
-            do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state,
-                                     state_global, observablesHistory, top_global, fr, outf,
-                                     energyOutput, ekind, f.view().force(), isCheckpointingStep,
-                                     doRerun, isLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
-        }
-
-        stopHandler->setSignal();
-
-        if (vsite != nullptr)
-        {
-            wallcycle_start(wcycle, ewcVSITECONSTR);
-            vsite->construct(state->x, ir->delta_t, state->v, state->box);
-            wallcycle_stop(wcycle, ewcVSITECONSTR);
-        }
-
-        {
-            const bool          doInterSimSignal = false;
-            const bool          doIntraSimSignal = true;
-            bool                bSumEkinhOld     = false;
-            t_vcm*              vcm              = nullptr;
-            SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
-
-            compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
-                            makeConstArrayRef(state->v), state->box, mdatoms, nrnb, vcm, wcycle,
-                            enerd, force_vir, shake_vir, total_vir, pres, constr, &signaller,
-                            state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
-                            CGLO_GSTAT | CGLO_ENERGY
-                                    | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
-                                                                             : 0));
-            checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
-                                            &top, makeConstArrayRef(state->x), state->box,
-                                            &shouldCheckNumberOfBondedInteractions);
-        }
-
-        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-           the virial that should probably be addressed eventually. state->veta has better properies,
-           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
-
-        /* Output stuff */
-        if (MASTER(cr))
-        {
-            const bool bCalcEnerStep = true;
-            energyOutput.addDataAtEnergyStep(
-                    doFreeEnergyPerturbation, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
-                    ir->expandedvals, state->box,
-                    PTCouplingArrays({ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
-                                       state->nhpres_xi, state->nhpres_vxi }),
-                    state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
-
-            const bool do_ene = true;
-            const bool do_log = true;
-            Awh*       awh    = nullptr;
-            const bool do_dr  = ir->nstdisreout != 0;
-            const bool do_or  = ir->nstorireout != 0;
-
-            EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
-            energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
-                                               do_log ? fplog : nullptr, step, t, fr->fcdata.get(), awh);
-
-            if (ir->bPull)
-            {
-                pull_print_output(pull_work, step, t);
-            }
-
-            if (do_per_step(step, ir->nstlog))
-            {
-                if (fflush(fplog) != 0)
-                {
-                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
-                }
-            }
-        }
-
-        /* Print the remaining wall clock time for the run */
-        if (isMasterSimMasterRank(ms, MASTER(cr)) && (mdrunOptions.verbose || gmx_got_usr_signal()))
-        {
-            if (shellfc)
-            {
-                fprintf(stderr, "\n");
-            }
-            print_time(stderr, walltime_accounting, step, ir, cr);
-        }
-
-        /* Ion/water position swapping.
-         * Not done in last step since trajectory writing happens before this call
-         * in the MD loop and exchanges would be lost anyway. */
-        if ((ir->eSwapCoords != eswapNO) && (step > 0) && !isLastStep && do_per_step(step, ir->swap->nstswap))
-        {
-            const bool doRerun = true;
-            do_swapcoords(cr, step, t, ir, swap, wcycle, rerun_fr.x, rerun_fr.box,
-                          MASTER(cr) && mdrunOptions.verbose, doRerun);
-        }
-
-        if (MASTER(cr))
-        {
-            /* read next frame from input trajectory */
-            isLastStep = !read_next_frame(oenv, status, &rerun_fr);
-        }
-
-        if (PAR(cr))
-        {
-            rerun_parallel_comm(cr, &rerun_fr, &isLastStep);
-        }
-
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
-        if (DOMAINDECOMP(cr) && wcycle)
-        {
-            dd_cycles_add(cr->dd, cycles, ddCyclStep);
-        }
-
-        if (!rerun_fr.bStep)
-        {
-            /* increase the MD step number */
-            step++;
-            step_rel++;
-        }
-    }
-    /* End of main MD loop */
-
-    /* Closing TNG files can include compressing data. Therefore it is good to do that
-     * before stopping the time measurements. */
-    mdoutf_tng_close(outf);
-
-    /* Stop measuring walltime */
-    walltime_accounting_end_time(walltime_accounting);
-
-    if (MASTER(cr))
-    {
-        close_trx(status);
-    }
-
-    if (!thisRankHasDuty(cr, DUTY_PME))
-    {
-        /* Tell the PME only node to finish */
-        gmx_pme_send_finish(cr);
-    }
-
-    done_mdoutf(outf);
-
-    done_shellfc(fplog, shellfc, step_rel);
-
-    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
-}
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/runner.cpp b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/runner.cpp
deleted file mode 100644
index eaaf3ae457..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/runner.cpp
+++ /dev/null
@@ -1,2394 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements the MD runner routine calling all integrators.
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "runner.h"
-
-#include "config.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <csignal>
-#include <cstdlib>
-#include <cstring>
-
-#include <algorithm>
-#include <memory>
-
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/builder.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/gpuhaloexchange.h"
-#include "gromacs/domdec/localatomsetmanager.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/ewald/ewald_utils.h"
-#include "gromacs/ewald/pme_gpu_program.h"
-#include "gromacs/ewald/pme_only.h"
-#include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/fileio/checkpoint.h"
-#include "gromacs/fileio/gmxfio.h"
-#include "gromacs/fileio/oenv.h"
-#include "gromacs/fileio/tpxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
-#include "gromacs/hardware/cpuinfo.h"
-#include "gromacs/hardware/detecthardware.h"
-#include "gromacs/hardware/device_management.h"
-#include "gromacs/hardware/hardwaretopology.h"
-#include "gromacs/hardware/printhardware.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/listed_forces/orires.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/utilities.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/boxdeformation.h"
-#include "gromacs/mdlib/broadcaststructs.h"
-#include "gromacs/mdlib/calc_verletbuf.h"
-#include "gromacs/mdlib/dispersioncorrection.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/gpuforcereduction.h"
-#include "gromacs/mdlib/makeconstraints.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/updategroups.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrun/mdmodules.h"
-#include "gromacs/mdrun/simulationcontext.h"
-#include "gromacs/mdrun/simulationinput.h"
-#include "gromacs/mdrun/simulationinputhandle.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/logging.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdrunutility/threadaffinity.h"
-#include "gromacs/mdtypes/checkpointdata.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/fcdata.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/observableshistory.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/modularsimulator/modularsimulator.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/nbnxm/pairlist_tuning.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/output.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/restraint/manager.h"
-#include "gromacs/restraint/restraintmdmodule.h"
-#include "gromacs/restraint/restraintpotential.h"
-#include "gromacs/swap/swapcoords.h"
-#include "gromacs/taskassignment/decidegpuusage.h"
-#include "gromacs/taskassignment/decidesimulationworkload.h"
-#include "gromacs/taskassignment/resourcedivision.h"
-#include "gromacs/taskassignment/taskassignment.h"
-#include "gromacs/taskassignment/usergpuids.h"
-#include "gromacs/timing/gpu_timing.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/wallcyclereporting.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/trajectory/trajectoryframe.h"
-#include "gromacs/utility/basenetwork.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/exceptions.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/filestream.h"
-#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/keyvaluetree.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/loggerbuilder.h"
-#include "gromacs/utility/mdmodulenotification.h"
-#include "gromacs/utility/physicalnodecommunicator.h"
-#include "gromacs/utility/pleasecite.h"
-#include "gromacs/utility/programcontext.h"
-#include "gromacs/utility/smalloc.h"
-#include "gromacs/utility/stringutil.h"
-
-#include "isimulator.h"
-#include "membedholder.h"
-#include "replicaexchange.h"
-#include "simulatorbuilder.h"
-
-/* PLUMED */
-#include "../../../Plumed.h"
-int    plumedswitch;
-plumed plumedmain; 
-/* END PLUMED */
-
-/* PLUMED HREX */
-int plumed_hrex;
-/* END PLUMED HREX */
-
-namespace gmx
-{
-
-
-/*! \brief Manage any development feature flag variables encountered
- *
- * The use of dev features indicated by environment variables is
- * logged in order to ensure that runs with such features enabled can
- * be identified from their log and standard output. Any cross
- * dependencies are also checked, and if unsatisfied, a fatal error
- * issued.
- *
- * Note that some development features overrides are applied already here:
- * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
- *
- * \param[in]  mdlog                Logger object.
- * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
- * \param[in]  pmeRunMode           The PME run mode for this run
- * \returns                         The object populated with development feature flags.
- */
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
-                                                         const bool           useGpuForNonbonded,
-                                                         const PmeRunMode     pmeRunMode)
-{
-    DevelopmentFeatureFlags devFlags;
-
-    // Some builds of GCC 5 give false positive warnings that these
-    // getenv results are ignored when clearly they are used.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-result"
-
-    devFlags.enableGpuBufferOps =
-            GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
-    devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
-    devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
-    devFlags.enableGpuPmePPComm =
-            GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
-
-#pragma GCC diagnostic pop
-
-    if (devFlags.enableGpuBufferOps)
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This run uses the 'GPU buffer ops' feature, enabled by the "
-                        "GMX_USE_GPU_BUFFER_OPS environment variable.");
-    }
-
-    if (devFlags.forceGpuUpdateDefault)
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This run will default to '-update gpu' as requested by the "
-                        "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable. GPU update with domain "
-                        "decomposition lacks substantial testing and should be used with caution.");
-    }
-
-    if (devFlags.enableGpuHaloExchange)
-    {
-        if (useGpuForNonbonded)
-        {
-            if (!devFlags.enableGpuBufferOps)
-            {
-                GMX_LOG(mdlog.warning)
-                        .asParagraph()
-                        .appendTextFormatted(
-                                "Enabling GPU buffer operations required by GMX_GPU_DD_COMMS "
-                                "(equivalent with GMX_USE_GPU_BUFFER_OPS=1).");
-                devFlags.enableGpuBufferOps = true;
-            }
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "This run has requested the 'GPU halo exchange' feature, enabled by "
-                            "the "
-                            "GMX_GPU_DD_COMMS environment variable.");
-        }
-        else
-        {
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
-                            "halo exchange' feature will not be enabled as nonbonded interactions "
-                            "are not offloaded.");
-            devFlags.enableGpuHaloExchange = false;
-        }
-    }
-
-    if (devFlags.enableGpuPmePPComm)
-    {
-        if (pmeRunMode == PmeRunMode::GPU)
-        {
-            if (!devFlags.enableGpuBufferOps)
-            {
-                GMX_LOG(mdlog.warning)
-                        .asParagraph()
-                        .appendTextFormatted(
-                                "Enabling GPU buffer operations required by GMX_GPU_PME_PP_COMMS "
-                                "(equivalent with GMX_USE_GPU_BUFFER_OPS=1).");
-                devFlags.enableGpuBufferOps = true;
-            }
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "This run uses the 'GPU PME-PP communications' feature, enabled "
-                            "by the GMX_GPU_PME_PP_COMMS environment variable.");
-        }
-        else
-        {
-            std::string clarification;
-            if (pmeRunMode == PmeRunMode::Mixed)
-            {
-                clarification =
-                        "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
-                        "mode).";
-            }
-            else
-            {
-                clarification = "PME is not offloaded to the GPU.";
-            }
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendText(
-                            "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
-                            "'GPU PME-PP communications' feature was not enabled as "
-                            + clarification);
-            devFlags.enableGpuPmePPComm = false;
-        }
-    }
-
-    return devFlags;
-}
-
-/*! \brief Barrier for safe simultaneous thread access to mdrunner data
- *
- * Used to ensure that the master thread does not modify mdrunner during copy
- * on the spawned threads. */
-static void threadMpiMdrunnerAccessBarrier()
-{
-#if GMX_THREAD_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-}
-
-Mdrunner Mdrunner::cloneOnSpawnedThread() const
-{
-    auto newRunner = Mdrunner(std::make_unique<MDModules>());
-
-    // All runners in the same process share a restraint manager resource because it is
-    // part of the interface to the client code, which is associated only with the
-    // original thread. Handles to the same resources can be obtained by copy.
-    {
-        newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
-    }
-
-    // Copy members of master runner.
-    // \todo Replace with builder when Simulation context and/or runner phases are better defined.
-    // Ref https://gitlab.com/gromacs/gromacs/-/issues/2587 and https://gitlab.com/gromacs/gromacs/-/issues/2375
-    newRunner.hw_opt    = hw_opt;
-    newRunner.filenames = filenames;
-
-    newRunner.hwinfo_         = hwinfo_;
-    newRunner.oenv            = oenv;
-    newRunner.mdrunOptions    = mdrunOptions;
-    newRunner.domdecOptions   = domdecOptions;
-    newRunner.nbpu_opt        = nbpu_opt;
-    newRunner.pme_opt         = pme_opt;
-    newRunner.pme_fft_opt     = pme_fft_opt;
-    newRunner.bonded_opt      = bonded_opt;
-    newRunner.update_opt      = update_opt;
-    newRunner.nstlist_cmdline = nstlist_cmdline;
-    newRunner.replExParams    = replExParams;
-    newRunner.pforce          = pforce;
-    // Give the spawned thread the newly created valid communicator
-    // for the simulation.
-    newRunner.libraryWorldCommunicator = MPI_COMM_WORLD;
-    newRunner.simulationCommunicator   = MPI_COMM_WORLD;
-    newRunner.ms                       = ms;
-    newRunner.startingBehavior         = startingBehavior;
-    newRunner.stopHandlerBuilder_      = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
-    newRunner.inputHolder_             = inputHolder_;
-
-    threadMpiMdrunnerAccessBarrier();
-
-    return newRunner;
-}
-
-/*! \brief The callback used for running on spawned threads.
- *
- * Obtains the pointer to the master mdrunner object from the one
- * argument permitted to the thread-launch API call, copies it to make
- * a new runner for this thread, reinitializes necessary data, and
- * proceeds to the simulation. */
-static void mdrunner_start_fn(const void* arg)
-{
-    try
-    {
-        auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
-        /* copy the arg list to make sure that it's thread-local. This
-           doesn't copy pointed-to items, of course; fnm, cr and fplog
-           are reset in the call below, all others should be const. */
-        gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
-        mdrunner.mdrunner();
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-}
-
-
-void Mdrunner::spawnThreads(int numThreadsToLaunch)
-{
-#if GMX_THREAD_MPI
-    /* now spawn new threads that start mdrunner_start_fn(), while
-       the main thread returns. Thread affinity is handled later. */
-    if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn,
-                     static_cast<const void*>(this))
-        != TMPI_SUCCESS)
-    {
-        GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
-    }
-
-    // Give the master thread the newly created valid communicator for
-    // the simulation.
-    libraryWorldCommunicator = MPI_COMM_WORLD;
-    simulationCommunicator   = MPI_COMM_WORLD;
-    threadMpiMdrunnerAccessBarrier();
-#else
-    GMX_UNUSED_VALUE(numThreadsToLaunch);
-    GMX_UNUSED_VALUE(mdrunner_start_fn);
-#endif
-}
-
-} // namespace gmx
-
-/*! \brief Initialize variables for Verlet scheme simulation */
-static void prepare_verlet_scheme(FILE*               fplog,
-                                  t_commrec*          cr,
-                                  t_inputrec*         ir,
-                                  int                 nstlist_cmdline,
-                                  const gmx_mtop_t*   mtop,
-                                  const matrix        box,
-                                  bool                makeGpuPairList,
-                                  const gmx::CpuInfo& cpuinfo)
-{
-    // We checked the cut-offs in grompp, but double-check here.
-    // We have PME+LJcutoff kernels for rcoulomb>rvdw.
-    if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == eelCUT)
-    {
-        GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
-                           "With Verlet lists and PME we should have rcoulomb>=rvdw");
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw,
-                           "With Verlet lists and no PME rcoulomb and rvdw should be identical");
-    }
-    /* For NVE simulations, we will retain the initial list buffer */
-    if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
-    {
-        /* Update the Verlet buffer size for the current run setup */
-
-        /* Here we assume SIMD-enabled kernels are being used. But as currently
-         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
-         * and 4x2 gives a larger buffer than 4x4, this is ok.
-         */
-        ListSetupType listType =
-                (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
-        VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
-
-        const real rlist_new =
-                calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
-
-        if (rlist_new != ir->rlist)
-        {
-            if (fplog != nullptr)
-            {
-                fprintf(fplog,
-                        "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
-                        ir->rlist, rlist_new, listSetup.cluster_size_i, listSetup.cluster_size_j);
-            }
-            ir->rlist = rlist_new;
-        }
-    }
-
-    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
-    {
-        gmx_fatal(FARGS, "Can not set nstlist without %s",
-                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
-    }
-
-    if (EI_DYNAMICS(ir->eI))
-    {
-        /* Set or try nstlist values */
-        increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
-    }
-}
-
-/*! \brief Override the nslist value in inputrec
- *
- * with value passed on the command line (if any)
- */
-static void override_nsteps_cmdline(const gmx::MDLogger& mdlog, int64_t nsteps_cmdline, t_inputrec* ir)
-{
-    assert(ir);
-
-    /* override with anything else than the default -2 */
-    if (nsteps_cmdline > -2)
-    {
-        char sbuf_steps[STEPSTRSIZE];
-        char sbuf_msg[STRLEN];
-
-        ir->nsteps = nsteps_cmdline;
-        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
-        {
-            sprintf(sbuf_msg,
-                    "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
-                    gmx_step_str(nsteps_cmdline, sbuf_steps), fabs(nsteps_cmdline * ir->delta_t));
-        }
-        else
-        {
-            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
-                    gmx_step_str(nsteps_cmdline, sbuf_steps));
-        }
-
-        GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
-    }
-    else if (nsteps_cmdline < -2)
-    {
-        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64, nsteps_cmdline);
-    }
-    /* Do nothing if nsteps_cmdline == -2 */
-}
-
-namespace gmx
-{
-
-/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
- *
- * If not, and if a warning may be issued, logs a warning about
- * falling back to CPU code. With thread-MPI, only the first
- * call to this function should have \c issueWarning true. */
-static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger& mdlog, const t_inputrec& ir, bool issueWarning)
-{
-    bool        gpuIsUseful = true;
-    std::string warning;
-
-    if (ir.opts.ngener - ir.nwall > 1)
-    {
-        /* The GPU code does not support more than one energy group.
-         * If the user requested GPUs explicitly, a fatal error is given later.
-         */
-        gpuIsUseful = false;
-        warning =
-                "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
-                "For better performance, run on the GPU without energy groups and then do "
-                "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
-    }
-
-    if (EI_TPI(ir.eI))
-    {
-        gpuIsUseful = false;
-        warning     = "TPI is not implemented for GPUs.";
-    }
-
-    if (!gpuIsUseful && issueWarning)
-    {
-        GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
-    }
-
-    return gpuIsUseful;
-}
-
-//! Initializes the logger for mdrun.
-static gmx::LoggerOwner buildLogger(FILE* fplog, const bool isSimulationMasterRank)
-{
-    gmx::LoggerBuilder builder;
-    if (fplog != nullptr)
-    {
-        builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
-    }
-    if (isSimulationMasterRank)
-    {
-        builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
-    }
-    return builder.build();
-}
-
-//! Make a TaskTarget from an mdrun argument string.
-static TaskTarget findTaskTarget(const char* optionString)
-{
-    TaskTarget returnValue = TaskTarget::Auto;
-
-    if (strncmp(optionString, "auto", 3) == 0)
-    {
-        returnValue = TaskTarget::Auto;
-    }
-    else if (strncmp(optionString, "cpu", 3) == 0)
-    {
-        returnValue = TaskTarget::Cpu;
-    }
-    else if (strncmp(optionString, "gpu", 3) == 0)
-    {
-        returnValue = TaskTarget::Gpu;
-    }
-    else
-    {
-        GMX_ASSERT(false, "Option string should have been checked for sanity already");
-    }
-
-    return returnValue;
-}
-
-//! Finish run, aggregate data to print performance info.
-static void finish_run(FILE*                     fplog,
-                       const gmx::MDLogger&      mdlog,
-                       const t_commrec*          cr,
-                       const t_inputrec*         inputrec,
-                       t_nrnb                    nrnb[],
-                       gmx_wallcycle_t           wcycle,
-                       gmx_walltime_accounting_t walltime_accounting,
-                       nonbonded_verlet_t*       nbv,
-                       const gmx_pme_t*          pme,
-                       gmx_bool                  bWriteStat)
-{
-    double delta_t = 0;
-    double nbfs = 0, mflop = 0;
-    double elapsed_time, elapsed_time_over_all_ranks, elapsed_time_over_all_threads,
-            elapsed_time_over_all_threads_over_all_ranks;
-    /* Control whether it is valid to print a report. Only the
-       simulation master may print, but it should not do so if the run
-       terminated e.g. before a scheduled reset step. This is
-       complicated by the fact that PME ranks are unaware of the
-       reason why they were sent a pmerecvqxFINISH. To avoid
-       communication deadlocks, we always do the communication for the
-       report, even if we've decided not to write the report, because
-       how long it takes to finish the run is not important when we've
-       decided not to report on the simulation performance.
-
-       Further, we only report performance for dynamical integrators,
-       because those are the only ones for which we plan to
-       consider doing any optimizations. */
-    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
-
-    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendText("Simulation ended prematurely, no performance report will be written.");
-        printReport = false;
-    }
-
-    t_nrnb*                 nrnb_tot;
-    std::unique_ptr<t_nrnb> nrnbTotalStorage;
-    if (cr->nnodes > 1)
-    {
-        nrnbTotalStorage = std::make_unique<t_nrnb>();
-        nrnb_tot         = nrnbTotalStorage.get();
-#if GMX_MPI
-        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
-#endif
-    }
-    else
-    {
-        nrnb_tot = nrnb;
-    }
-
-    elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
-    elapsed_time_over_all_threads =
-            walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
-    if (cr->nnodes > 1)
-    {
-#if GMX_MPI
-        /* reduce elapsed_time over all MPI ranks in the current simulation */
-        MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM,
-                      cr->mpi_comm_mysim);
-        elapsed_time_over_all_ranks /= cr->nnodes;
-        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
-         * current simulation. */
-        MPI_Allreduce(&elapsed_time_over_all_threads, &elapsed_time_over_all_threads_over_all_ranks,
-                      1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
-#endif
-    }
-    else
-    {
-        elapsed_time_over_all_ranks                  = elapsed_time;
-        elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
-    }
-
-    if (printReport)
-    {
-        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
-    }
-
-    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
-    {
-        print_dd_statistics(cr, inputrec, fplog);
-    }
-
-    /* TODO Move the responsibility for any scaling by thread counts
-     * to the code that handled the thread region, so that there's a
-     * mechanism to keep cycle counting working during the transition
-     * to task parallelism. */
-    int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
-    int nthreads_pme = gmx_omp_nthreads_get(emntPME);
-    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP),
-                                   nthreads_pp, nthreads_pme);
-    auto cycle_sum(wallcycle_sum(cr, wcycle));
-
-    if (printReport)
-    {
-        auto nbnxn_gpu_timings =
-                (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
-        gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
-
-        if (pme_gpu_task_enabled(pme))
-        {
-            pme_gpu_get_timings(pme, &pme_gpu_timings);
-        }
-        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
-                        elapsed_time_over_all_ranks, wcycle, cycle_sum, nbnxn_gpu_timings,
-                        &pme_gpu_timings);
-
-        if (EI_DYNAMICS(inputrec->eI))
-        {
-            delta_t = inputrec->delta_t;
-        }
-
-        if (fplog)
-        {
-            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
-                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
-                       delta_t, nbfs, mflop);
-        }
-        if (bWriteStat)
-        {
-            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
-                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
-                       delta_t, nbfs, mflop);
-        }
-    }
-}
-
-int Mdrunner::mdrunner()
-{
-    matrix                    box;
-    t_forcerec*               fr               = nullptr;
-    real                      ewaldcoeff_q     = 0;
-    real                      ewaldcoeff_lj    = 0;
-    int                       nChargePerturbed = -1, nTypePerturbed = 0;
-    gmx_wallcycle_t           wcycle;
-    gmx_walltime_accounting_t walltime_accounting = nullptr;
-    MembedHolder              membedHolder(filenames.size(), filenames.data());
-
-    /* CAUTION: threads may be started later on in this function, so
-       cr doesn't reflect the final parallel state right now */
-    gmx_mtop_t mtop;
-
-    /* TODO: inputrec should tell us whether we use an algorithm, not a file option */
-    const bool doEssentialDynamics = opt2bSet("-ei", filenames.size(), filenames.data());
-    const bool doRerun             = mdrunOptions.rerun;
-
-    // Handle task-assignment related user options.
-    EmulateGpuNonbonded emulateGpuNonbonded =
-            (getenv("GMX_EMULATE_GPU") != nullptr ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
-
-    std::vector<int> userGpuTaskAssignment;
-    try
-    {
-        userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-    auto nonbondedTarget = findTaskTarget(nbpu_opt);
-    auto pmeTarget       = findTaskTarget(pme_opt);
-    auto pmeFftTarget    = findTaskTarget(pme_fft_opt);
-    auto bondedTarget    = findTaskTarget(bonded_opt);
-    auto updateTarget    = findTaskTarget(update_opt);
-
-    FILE* fplog = nullptr;
-    // If we are appending, we don't write log output because we need
-    // to check that the old log file matches what the checkpoint file
-    // expects. Otherwise, we should start to write log output now if
-    // there is a file ready for it.
-    if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
-    {
-        fplog = gmx_fio_getfp(logFileHandle);
-    }
-    const bool isSimulationMasterRank = findIsSimulationMasterRank(ms, simulationCommunicator);
-    gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
-    gmx::MDLogger    mdlog(logOwner.logger());
-
-    gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
-
-    std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo_->deviceInfoList, hw_opt.gpuIdsAvailable);
-    const int        numDevicesToUse = gmx::ssize(gpuIdsToUse);
-
-    // Print citation requests after all software/hardware printing
-    pleaseCiteGromacs(fplog);
-
-    // Note: legacy program logic relies on checking whether these pointers are assigned.
-    // Objects may or may not be allocated later.
-    std::unique_ptr<t_inputrec> inputrec;
-    std::unique_ptr<t_state>    globalState;
-
-    auto partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
-
-    if (isSimulationMasterRank)
-    {
-        // Allocate objects to be initialized by later function calls.
-        /* Only the master rank has the global state */
-        globalState = std::make_unique<t_state>();
-        inputrec    = std::make_unique<t_inputrec>();
-
-        /* Read (nearly) all data required for the simulation
-         * and keep the partly serialized tpr contents to send to other ranks later
-         */
-        applyGlobalSimulationState(*inputHolder_.get(), partialDeserializedTpr.get(),
-                                   globalState.get(), inputrec.get(), &mtop);
-    }
-
-    /* Check and update the hardware options for internal consistency */
-    checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks,
-                                  inputrec.get());
-
-    if (GMX_THREAD_MPI && isSimulationMasterRank)
-    {
-        bool useGpuForNonbonded = false;
-        bool useGpuForPme       = false;
-        try
-        {
-            GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
-
-            // If the user specified the number of ranks, then we must
-            // respect that, but in default mode, we need to allow for
-            // the number of GPUs to choose the number of ranks.
-            auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
-            useGpuForNonbonded         = decideWhetherToUseGpusForNonbondedWithThreadMpi(
-                    nonbondedTarget, numDevicesToUse, userGpuTaskAssignment, emulateGpuNonbonded,
-                    canUseGpuForNonbonded,
-                    gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
-                    hw_opt.nthreads_tmpi);
-            useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
-                    useGpuForNonbonded, pmeTarget, pmeFftTarget, numDevicesToUse, userGpuTaskAssignment,
-                    *hwinfo_, *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-        /* Determine how many thread-MPI ranks to start.
-         *
-         * TODO Over-writing the user-supplied value here does
-         * prevent any possible subsequent checks from working
-         * correctly. */
-        hw_opt.nthreads_tmpi =
-                get_nthreads_mpi(hwinfo_, &hw_opt, numDevicesToUse, useGpuForNonbonded, useGpuForPme,
-                                 inputrec.get(), &mtop, mdlog, membedHolder.doMembed());
-
-        // Now start the threads for thread MPI.
-        spawnThreads(hw_opt.nthreads_tmpi);
-        // The spawned threads enter mdrunner() and execution of
-        // master and spawned threads joins at the end of this block.
-    }
-
-    GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
-                       "Must have valid communicator unless running a multi-simulation");
-    CommrecHandle crHandle = init_commrec(simulationCommunicator);
-    t_commrec*    cr       = crHandle.get();
-    GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
-
-    PhysicalNodeCommunicator physicalNodeComm(libraryWorldCommunicator, gmx_physicalnode_id_hash());
-
-    // If we detected the topology on this system, double-check that it makes sense
-    if (hwinfo_->hardwareTopology->isThisSystem())
-    {
-        hardwareTopologyDoubleCheckDetection(mdlog, *hwinfo_->hardwareTopology);
-    }
-
-    if (PAR(cr))
-    {
-        /* now broadcast everything to the non-master nodes/threads: */
-        if (!isSimulationMasterRank)
-        {
-            // Until now, only the master rank has a non-null pointer.
-            // On non-master ranks, allocate the object that will receive data in the following call.
-            inputrec = std::make_unique<t_inputrec>();
-        }
-        init_parallel(cr->mpiDefaultCommunicator, MASTER(cr), inputrec.get(), &mtop,
-                      partialDeserializedTpr.get());
-    }
-    GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
-    partialDeserializedTpr.reset(nullptr);
-
-    // Now the number of ranks is known to all ranks, and each knows
-    // the inputrec read by the master rank. The ranks can now all run
-    // the task-deciding functions and will agree on the result
-    // without needing to communicate.
-    const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
-
-    // Note that these variables describe only their own node.
-    //
-    // Note that when bonded interactions run on a GPU they always run
-    // alongside a nonbonded task, so do not influence task assignment
-    // even though they affect the force calculation workload.
-    bool useGpuForNonbonded = false;
-    bool useGpuForPme       = false;
-    bool useGpuForBonded    = false;
-    bool useGpuForUpdate    = false;
-    bool gpusWereDetected   = hwinfo_->ngpu_compatible_tot > 0;
-    try
-    {
-        // It's possible that there are different numbers of GPUs on
-        // different nodes, which is the user's responsibility to
-        // handle. If unsuitable, we will notice that during task
-        // assignment.
-        auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
-        useGpuForNonbonded         = decideWhetherToUseGpusForNonbonded(
-                nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
-                gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
-        useGpuForPme = decideWhetherToUseGpusForPme(
-                useGpuForNonbonded, pmeTarget, pmeFftTarget, userGpuTaskAssignment, *hwinfo_,
-                *inputrec, cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected);
-        useGpuForBonded = decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
-                                                          bondedTarget, *inputrec, mtop,
-                                                          domdecOptions.numPmeRanks, gpusWereDetected);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-    const PmeRunMode pmeRunMode = determinePmeRunMode(useGpuForPme, pmeFftTarget, *inputrec);
-
-    // Initialize development feature flags that enabled by environment variable
-    // and report those features that are enabled.
-    const DevelopmentFeatureFlags devFlags =
-            manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
-
-    const bool useModularSimulator =
-            checkUseModularSimulator(false, inputrec.get(), doRerun, mtop, ms, replExParams,
-                                     nullptr, doEssentialDynamics, membedHolder.doMembed() && (plumedswitch==0) /* PLUMED */);
-
-    // Build restraints.
-    // TODO: hide restraint implementation details from Mdrunner.
-    // There is nothing unique about restraints at this point as far as the
-    // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
-    // factory functions from the SimulationContext on which to call mdModules_->add().
-    // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
-    for (auto&& restraint : restraintManager_->getRestraints())
-    {
-        auto module = RestraintMDModule::create(restraint, restraint->sites());
-        mdModules_->add(std::move(module));
-    }
-
-    // TODO: Error handling
-    mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
-    // now that the MdModules know their options, they know which callbacks to sign up to
-    mdModules_->subscribeToSimulationSetupNotifications();
-    const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
-
-    if (inputrec->internalParameters != nullptr)
-    {
-        mdModulesNotifier.notify(*inputrec->internalParameters);
-    }
-
-    if (fplog != nullptr)
-    {
-        pr_inputrec(fplog, 0, "Input Parameters", inputrec.get(), FALSE);
-        fprintf(fplog, "\n");
-    }
-
-    if (SIMMASTER(cr))
-    {
-        /* In rerun, set velocities to zero if present */
-        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
-        {
-            // rerun does not use velocities
-            GMX_LOG(mdlog.info)
-                    .asParagraph()
-                    .appendText(
-                            "Rerun trajectory contains velocities. Rerun does only evaluate "
-                            "potential energy and forces. The velocities will be ignored.");
-            for (int i = 0; i < globalState->natoms; i++)
-            {
-                clear_rvec(globalState->v[i]);
-            }
-            globalState->flags &= ~(1 << estV);
-        }
-
-        /* now make sure the state is initialized and propagated */
-        set_state_entries(globalState.get(), inputrec.get(), useModularSimulator);
-    }
-
-    /* NM and TPI parallelize over force/energy calculations, not atoms,
-     * so we need to initialize and broadcast the global state.
-     */
-    if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
-    {
-        if (!MASTER(cr))
-        {
-            globalState = std::make_unique<t_state>();
-        }
-        broadcastStateWithoutDynamics(cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr),
-                                      globalState.get());
-    }
-
-    /* A parallel command line option consistency check that we can
-       only do after any threads have started. */
-    if (!PAR(cr)
-        && (domdecOptions.numCells[XX] > 1 || domdecOptions.numCells[YY] > 1
-            || domdecOptions.numCells[ZZ] > 1 || domdecOptions.numPmeRanks > 0))
-    {
-        gmx_fatal(FARGS,
-                  "The -dd or -npme option request a parallel simulation, "
-#if !GMX_MPI
-                  "but %s was compiled without threads or MPI enabled",
-                  output_env_get_program_display_name(oenv));
-#elif GMX_THREAD_MPI
-                  "but the number of MPI-threads (option -ntmpi) is not set or is 1");
-#else
-                  "but %s was not started through mpirun/mpiexec or only one rank was requested "
-                  "through mpirun/mpiexec",
-                  output_env_get_program_display_name(oenv));
-#endif
-    }
-
-    if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
-    {
-        gmx_fatal(FARGS,
-                  "The .mdp file specified an energy mininization or normal mode algorithm, and "
-                  "these are not compatible with mdrun -rerun");
-    }
-
-    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
-    {
-        if (domdecOptions.numPmeRanks > 0)
-        {
-            gmx_fatal_collective(FARGS, cr->mpiDefaultCommunicator, MASTER(cr),
-                                 "PME-only ranks are requested, but the system does not use PME "
-                                 "for electrostatics or LJ");
-        }
-
-        domdecOptions.numPmeRanks = 0;
-    }
-
-    if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
-    {
-        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
-         * improve performance with many threads per GPU, since our OpenMP
-         * scaling is bad, but it's difficult to automate the setup.
-         */
-        domdecOptions.numPmeRanks = 0;
-    }
-    if (useGpuForPme)
-    {
-        if (domdecOptions.numPmeRanks < 0)
-        {
-            domdecOptions.numPmeRanks = 0;
-            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
-        }
-        else
-        {
-            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1,
-                               "PME GPU decomposition is not supported");
-        }
-    }
-
-    /* NMR restraints must be initialized before load_checkpoint,
-     * since with time averaging the history is added to t_state.
-     * For proper consistency check we therefore need to extend
-     * t_state here.
-     * So the PME-only nodes (if present) will also initialize
-     * the distance restraints.
-     */
-
-    /* This needs to be called before read_checkpoint to extend the state */
-    t_disresdata* disresdata;
-    snew(disresdata, 1);
-    init_disres(fplog, &mtop, inputrec.get(), DisResRunMode::MDRun,
-                MASTER(cr) ? DDRole::Master : DDRole::Agent,
-                PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mysim, ms, disresdata,
-                globalState.get(), replExParams.exchangeInterval > 0);
-
-    t_oriresdata* oriresdata;
-    snew(oriresdata, 1);
-    init_orires(fplog, &mtop, inputrec.get(), cr, ms, globalState.get(), oriresdata);
-
-    auto deform = prepareBoxDeformation(
-            globalState != nullptr ? globalState->box : box, MASTER(cr) ? DDRole::Master : DDRole::Agent,
-            PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mygroup, *inputrec);
-
-#if GMX_FAHCORE
-    /* We have to remember the generation's first step before reading checkpoint.
-       This way, we can report to the F@H core both the generation's first step
-       and the restored first step, thus making it able to distinguish between
-       an interruption/resume and start of the n-th generation simulation.
-       Having this information, the F@H core can correctly calculate and report
-       the progress.
-     */
-    int gen_first_step = 0;
-    if (MASTER(cr))
-    {
-        gen_first_step = inputrec->init_step;
-    }
-#endif
-
-    ObservablesHistory observablesHistory = {};
-
-    auto modularSimulatorCheckpointData = std::make_unique<ReadCheckpointDataHolder>();
-    if (startingBehavior != StartingBehavior::NewSimulation)
-    {
-        /* Check if checkpoint file exists before doing continuation.
-         * This way we can use identical input options for the first and subsequent runs...
-         */
-        if (mdrunOptions.numStepsCommandline > -2)
-        {
-            /* Temporarily set the number of steps to unlimited to avoid
-             * triggering the nsteps check in load_checkpoint().
-             * This hack will go away soon when the -nsteps option is removed.
-             */
-            inputrec->nsteps = -1;
-        }
-
-        // Finish applying initial simulation state information from external sources on all ranks.
-        // Reconcile checkpoint file data with Mdrunner state established up to this point.
-        applyLocalState(*inputHolder_.get(), logFileHandle, cr, domdecOptions.numCells,
-                        inputrec.get(), globalState.get(), &observablesHistory,
-                        mdrunOptions.reproducible, mdModules_->notifier(),
-                        modularSimulatorCheckpointData.get(), useModularSimulator);
-        // TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
-        //  invariants
-        //  on all code paths.
-        // Write checkpoint or provide hook to update SimulationInput.
-        // If there was a checkpoint file, SimulationInput contains more information
-        // than if there wasn't. At this point, we have synchronized the in-memory
-        // state with the filesystem state only for restarted simulations. We should
-        // be calling applyLocalState unconditionally and expect that the completeness
-        // of SimulationInput is not dependent on its creation method.
-
-        if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
-        {
-            // Now we can start normal logging to the truncated log file.
-            fplog = gmx_fio_getfp(logFileHandle);
-            prepareLogAppending(fplog);
-            logOwner = buildLogger(fplog, MASTER(cr));
-            mdlog    = logOwner.logger();
-        }
-    }
-
-#if GMX_FAHCORE
-    if (MASTER(cr))
-    {
-        fcRegisterSteps(inputrec->nsteps + inputrec->init_step, gen_first_step);
-    }
-#endif
-
-    if (mdrunOptions.numStepsCommandline > -2)
-    {
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText(
-                        "The -nsteps functionality is deprecated, and may be removed in a future "
-                        "version. "
-                        "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp "
-                        "file field.");
-    }
-    /* override nsteps with value set on the commandline */
-    override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec.get());
-
-    if (isSimulationMasterRank)
-    {
-        copy_mat(globalState->box, box);
-    }
-
-    if (PAR(cr))
-    {
-        gmx_bcast(sizeof(box), box, cr->mpiDefaultCommunicator);
-    }
-
-    if (inputrec->cutoff_scheme != ecutsVERLET)
-    {
-        gmx_fatal(FARGS,
-                  "This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
-                  "Verlet scheme, or use an earlier version of GROMACS if necessary.");
-    }
-    /* Update rlist and nstlist. */
-    /* Note: prepare_verlet_scheme is calling increaseNstlist(...), which (while attempting to
-     * increase rlist) tries to check if the newly chosen value fits with the DD scheme. As this is
-     * run before any DD scheme is set up, this check is never executed. See #3334 for more details.
-     */
-    prepare_verlet_scheme(fplog, cr, inputrec.get(), nstlist_cmdline, &mtop, box,
-                          useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
-                          *hwinfo_->cpuInfo);
-
-    // This builder is necessary while we have multi-part construction
-    // of DD. Before DD is constructed, we use the existence of
-    // the builder object to indicate that further construction of DD
-    // is needed.
-    std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
-    if (useDomainDecomposition)
-    {
-        ddBuilder = std::make_unique<DomainDecompositionBuilder>(
-                mdlog, cr, domdecOptions, mdrunOptions, mtop, *inputrec, box,
-                positionsFromStatePointer(globalState.get()));
-    }
-    else
-    {
-        /* PME, if used, is done on all nodes with 1D decomposition */
-        cr->nnodes     = cr->sizeOfDefaultCommunicator;
-        cr->sim_nodeid = cr->rankInDefaultCommunicator;
-        cr->nodeid     = cr->rankInDefaultCommunicator;
-        cr->npmenodes  = 0;
-        cr->duty       = (DUTY_PP | DUTY_PME);
-
-        if (inputrec->pbcType == PbcType::Screw)
-        {
-            gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
-        }
-    }
-
-    // Produce the task assignment for this rank - done after DD is constructed
-    GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
-            gpuIdsToUse, userGpuTaskAssignment, *hwinfo_, simulationCommunicator, physicalNodeComm,
-            nonbondedTarget, pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded,
-            useGpuForPme, thisRankHasDuty(cr, DUTY_PP),
-            // TODO cr->duty & DUTY_PME should imply that a PME
-            // algorithm is active, but currently does not.
-            EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
-
-    // Get the device handles for the modules, nullptr when no task is assigned.
-    int                deviceId   = -1;
-    DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
-
-    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
-    bool useTiming = true;
-
-    if (GMX_GPU_CUDA)
-    {
-        /* WARNING: CUDA timings are incorrect with multiple streams.
-         *          This is the main reason why they are disabled by default.
-         */
-        // TODO: Consider turning on by default when we can detect nr of streams.
-        useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-    }
-    else if (GMX_GPU_OPENCL)
-    {
-        useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-    }
-
-    // TODO Currently this is always built, yet DD partition code
-    // checks if it is built before using it. Probably it should
-    // become an MDModule that is made only when another module
-    // requires it (e.g. pull, CompEl, density fitting), so that we
-    // don't update the local atom sets unilaterally every step.
-    LocalAtomSetManager atomSets;
-    if (ddBuilder)
-    {
-        // TODO Pass the GPU streams to ddBuilder to use in buffer
-        // transfers (e.g. halo exchange)
-        cr->dd = ddBuilder->build(&atomSets);
-        // The builder's job is done, so destruct it
-        ddBuilder.reset(nullptr);
-        // Note that local state still does not exist yet.
-    }
-    // Ensure that all atoms within the same update group are in the
-    // same periodic image. Otherwise, a simulation that did not use
-    // update groups (e.g. a single-rank simulation) cannot always be
-    // correctly restarted in a way that does use update groups
-    // (e.g. a multi-rank simulation).
-    if (isSimulationMasterRank)
-    {
-        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-        if (useUpdateGroups)
-        {
-            putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
-        }
-    }
-
-    // The GPU update is decided here because we need to know whether the constraints or
-    // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
-    // defined). This is only known after DD is initialized, hence decision on using GPU
-    // update is done so late.
-    try
-    {
-        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-        const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
-
-        useGpuForUpdate = decideWhetherToUseGpuForUpdate(
-                useDomainDecomposition, useUpdateGroups, pmeRunMode, domdecOptions.numPmeRanks > 0,
-                useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
-                doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                replExParams.exchangeInterval > 0, haveFrozenAtoms, doRerun, devFlags, mdlog);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-    const bool printHostName = (cr->nnodes > 1);
-    gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
-
-    const bool disableNonbondedCalculation = (getenv("GMX_NO_NONBONDED") != nullptr);
-    if (disableNonbondedCalculation)
-    {
-        /* turn off non-bonded calculations */
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendText(
-                        "Found environment variable GMX_NO_NONBONDED.\n"
-                        "Disabling nonbonded calculations.");
-    }
-
-    MdrunScheduleWorkload runScheduleWork;
-
-    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
-            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
-            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
-
-    // Also populates the simulation constant workload description.
-    runScheduleWork.simulationWork = createSimulationWorkload(
-            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
-            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
-
-    std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
-
-    if (deviceInfo != nullptr)
-    {
-        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
-        {
-            dd_setup_dlb_resource_sharing(cr, deviceId);
-        }
-        deviceStreamManager = std::make_unique<DeviceStreamManager>(
-                *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useTiming);
-    }
-
-    // If the user chose a task assignment, give them some hints
-    // where appropriate.
-    if (!userGpuTaskAssignment.empty())
-    {
-        gpuTaskAssignments.logPerformanceHints(mdlog, numDevicesToUse);
-    }
-
-    if (PAR(cr))
-    {
-        /* After possible communicator splitting in make_dd_communicators.
-         * we can set up the intra/inter node communication.
-         */
-        gmx_setup_nodecomm(fplog, cr);
-    }
-
-#if GMX_MPI
-    if (isMultiSim(ms))
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This is simulation %d out of %d running as a composite GROMACS\n"
-                        "multi-simulation job. Setup for this simulation:\n",
-                        ms->simulationIndex_, ms->numSimulations_);
-    }
-    GMX_LOG(mdlog.warning)
-            .appendTextFormatted("Using %d MPI %s\n", cr->nnodes,
-#    if GMX_THREAD_MPI
-                                 cr->nnodes == 1 ? "thread" : "threads"
-#    else
-                                 cr->nnodes == 1 ? "process" : "processes"
-#    endif
-            );
-    fflush(stderr);
-#endif
-
-    // If mdrun -pin auto honors any affinity setting that already
-    // exists. If so, it is nice to provide feedback about whether
-    // that existing affinity setting was from OpenMP or something
-    // else, so we run this code both before and after we initialize
-    // the OpenMP support.
-    gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, FALSE);
-    /* Check and update the number of OpenMP threads requested */
-    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo_, cr, ms, physicalNodeComm.size_,
-                                            pmeRunMode, mtop, *inputrec);
-
-    gmx_omp_nthreads_init(mdlog, cr, hwinfo_->nthreads_hw_avail, physicalNodeComm.size_,
-                          hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
-
-    // Enable FP exception detection, but not in
-    // Release mode and not for compilers with known buggy FP
-    // exception support (clang with any optimization) or suspected
-    // buggy FP exception support (gcc 7.* with optimization).
-#if !defined NDEBUG                                                                         \
-        && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
-             && defined __OPTIMIZE__)
-    const bool bEnableFPE = true;
-#else
-    const bool bEnableFPE = false;
-#endif
-    // FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
-    if (bEnableFPE)
-    {
-        gmx_feenableexcept();
-    }
-
-    /* Now that we know the setup is consistent, check for efficiency */
-    check_resource_division_efficiency(hwinfo_, gpuTaskAssignments.thisRankHasAnyGpuTask(),
-                                       mdrunOptions.ntompOptionIsSet, cr, mdlog);
-
-    /* getting number of PP/PME threads on this MPI / tMPI rank.
-       PME: env variable should be read only on one node to make sure it is
-       identical everywhere;
-     */
-    const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
-                                                                  : gmx_omp_nthreads_get(emntPME);
-    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology,
-                                  physicalNodeComm, mdlog);
-
-    // Enable Peer access between GPUs where available
-    // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
-    // any of the GPU communication features are active.
-    if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
-        && (runScheduleWork.simulationWork.useGpuHaloExchange
-            || runScheduleWork.simulationWork.useGpuPmePpCommunication))
-    {
-        setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
-    }
-
-    if (hw_opt.threadAffinity != ThreadAffinity::Off)
-    {
-        /* Before setting affinity, check whether the affinity has changed
-         * - which indicates that probably the OpenMP library has changed it
-         * since we first checked).
-         */
-        gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, TRUE);
-
-        int numThreadsOnThisNode, intraNodeThreadOffset;
-        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
-                                 &intraNodeThreadOffset);
-
-        /* Set the CPU affinity */
-        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo_->hardwareTopology, numThreadsOnThisRank,
-                                numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
-    }
-
-    if (mdrunOptions.timingOptions.resetStep > -1)
-    {
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText(
-                        "The -resetstep functionality is deprecated, and may be removed in a "
-                        "future version.");
-    }
-    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
-
-    if (PAR(cr))
-    {
-        /* Master synchronizes its value of reset_counters with all nodes
-         * including PME only nodes */
-        int64_t reset_counters = wcycle_get_reset_counters(wcycle);
-        gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
-        wcycle_set_reset_counters(wcycle, reset_counters);
-    }
-
-    // Membrane embedding must be initialized before we call init_forcerec()
-    membedHolder.initializeMembed(fplog, filenames.size(), filenames.data(), &mtop, inputrec.get(),
-                                  globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
-
-    const bool               thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
-    std::unique_ptr<MDAtoms> mdAtoms;
-    std::unique_ptr<VirtualSitesHandler> vsite;
-    std::unique_ptr<GpuBonded>           gpuBonded;
-
-    t_nrnb nrnb;
-    if (thisRankHasDuty(cr, DUTY_PP))
-    {
-        mdModulesNotifier.notify(*cr);
-        mdModulesNotifier.notify(&atomSets);
-        mdModulesNotifier.notify(inputrec->pbcType);
-        mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
-        /* Initiate forcerecord */
-        fr                 = new t_forcerec;
-        fr->forceProviders = mdModules_->initForceProviders();
-        init_forcerec(fplog, mdlog, fr, inputrec.get(), &mtop, cr, box,
-                      opt2fn("-table", filenames.size(), filenames.data()),
-                      opt2fn("-tablep", filenames.size(), filenames.data()),
-                      opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
-        // Dirty hack, for fixing disres and orires should be made mdmodules
-        fr->fcdata->disres = disresdata;
-        fr->fcdata->orires = oriresdata;
-
-        // Save a handle to device stream manager to use elsewhere in the code
-        // TODO: Forcerec is not a correct place to store it.
-        fr->deviceStreamManager = deviceStreamManager.get();
-
-        if (runScheduleWork.simulationWork.useGpuPmePpCommunication && !thisRankHasDuty(cr, DUTY_PME))
-        {
-            GMX_RELEASE_ASSERT(
-                    deviceStreamManager != nullptr,
-                    "GPU device stream manager should be valid in order to use PME-PP direct "
-                    "communications.");
-            GMX_RELEASE_ASSERT(
-                    deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
-                    "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
-                    "communications.");
-            fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
-                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
-                    deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
-        }
-
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo_,
-                                        runScheduleWork.simulationWork.useGpuNonbonded,
-                                        deviceStreamManager.get(), &mtop, box, wcycle);
-        // TODO: Move the logic below to a GPU bonded builder
-        if (runScheduleWork.simulationWork.useGpuBonded)
-        {
-            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
-                               "GPU device stream manager should be valid in order to use GPU "
-                               "version of bonded forces.");
-            gpuBonded = std::make_unique<GpuBonded>(
-                    mtop.ffparams, fr->ic->epsfac * fr->fudgeQQ, deviceStreamManager->context(),
-                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
-            fr->gpuBonded = gpuBonded.get();
-        }
-
-        /* Initialize the mdAtoms structure.
-         * mdAtoms is not filled with atom data,
-         * as this can not be done now with domain decomposition.
-         */
-        mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
-        if (globalState && thisRankHasPmeGpuTask)
-        {
-            // The pinning of coordinates in the global state object works, because we only use
-            // PME on GPU without DD or on a separate PME rank, and because the local state pointer
-            // points to the global state object without DD.
-            // FIXME: MD and EM separately set up the local state - this should happen in the same
-            // function, which should also perform the pinning.
-            changePinningPolicy(&globalState->x, pme_get_pinning_policy());
-        }
-
-        /* Initialize the virtual site communication */
-        vsite = makeVirtualSitesHandler(mtop, cr, fr->pbcType);
-
-        calc_shifts(box, fr->shift_vec);
-
-        /* With periodic molecules the charge groups should be whole at start up
-         * and the virtual sites should not be far from their proper positions.
-         */
-        if (!inputrec->bContinuation && MASTER(cr)
-            && !(inputrec->pbcType != PbcType::No && inputrec->bPeriodicMols))
-        {
-            /* Make molecules whole at start of run */
-            if (fr->pbcType != PbcType::No)
-            {
-                do_pbc_first_mtop(fplog, inputrec->pbcType, box, &mtop, globalState->x.rvec_array());
-            }
-            if (vsite)
-            {
-                /* Correct initial vsite positions are required
-                 * for the initial distribution in the domain decomposition
-                 * and for the initial shell prediction.
-                 */
-                constructVirtualSitesGlobal(mtop, globalState->x);
-            }
-        }
-
-        if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
-        {
-            ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
-            ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-        }
-    }
-    else
-    {
-        /* This is a PME only node */
-
-        GMX_ASSERT(globalState == nullptr,
-                   "We don't need the state on a PME only rank and expect it to be unitialized");
-
-        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
-        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
-    }
-
-    gmx_pme_t* sepPmeData = nullptr;
-    // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
-    GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr),
-               "Double-checking that only PME-only ranks have no forcerec");
-    gmx_pme_t*& pmedata = fr ? fr->pmedata : sepPmeData;
-
-    // TODO should live in ewald module once its testing is improved
-    //
-    // Later, this program could contain kernels that might be later
-    // re-used as auto-tuning progresses, or subsequent simulations
-    // are invoked.
-    PmeGpuProgramStorage pmeGpuProgram;
-    if (thisRankHasPmeGpuTask)
-    {
-        GMX_RELEASE_ASSERT(
-                (deviceStreamManager != nullptr),
-                "GPU device stream manager should be initialized in order to use GPU for PME.");
-        GMX_RELEASE_ASSERT((deviceInfo != nullptr),
-                           "GPU device should be initialized in order to use GPU for PME.");
-        pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
-    }
-
-    /* Initiate PME if necessary,
-     * either on all nodes or on dedicated PME nodes only. */
-    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
-    {
-        if (mdAtoms && mdAtoms->mdatoms())
-        {
-            nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
-            if (EVDW_PME(inputrec->vdwtype))
-            {
-                nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed;
-            }
-        }
-        if (cr->npmenodes > 0)
-        {
-            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
-            gmx_bcast(sizeof(nChargePerturbed), &nChargePerturbed, cr->mpi_comm_mysim);
-            gmx_bcast(sizeof(nTypePerturbed), &nTypePerturbed, cr->mpi_comm_mysim);
-        }
-
-        if (thisRankHasDuty(cr, DUTY_PME))
-        {
-            try
-            {
-                // TODO: This should be in the builder.
-                GMX_RELEASE_ASSERT(!runScheduleWork.simulationWork.useGpuPme
-                                           || (deviceStreamManager != nullptr),
-                                   "Device stream manager should be valid in order to use GPU "
-                                   "version of PME.");
-                GMX_RELEASE_ASSERT(
-                        !runScheduleWork.simulationWork.useGpuPme
-                                || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
-                        "GPU PME stream should be valid in order to use GPU version of PME.");
-
-                const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
-                                                             ? &deviceStreamManager->context()
-                                                             : nullptr;
-                const DeviceStream* pmeStream =
-                        runScheduleWork.simulationWork.useGpuPme
-                                ? &deviceStreamManager->stream(DeviceStreamType::Pme)
-                                : nullptr;
-
-                pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec.get(),
-                                       nChargePerturbed != 0, nTypePerturbed != 0,
-                                       mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj,
-                                       gmx_omp_nthreads_get(emntPME), pmeRunMode, nullptr,
-                                       deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-        }
-    }
-
-
-    if (EI_DYNAMICS(inputrec->eI))
-    {
-        /* Turn on signal handling on all nodes */
-        /*
-         * (A user signal from the PME nodes (if any)
-         * is communicated to the PP nodes.
-         */
-        signal_handler_install();
-    }
-
-    pull_t* pull_work = nullptr;
-    if (thisRankHasDuty(cr, DUTY_PP))
-    {
-        /* Assumes uniform use of the number of OpenMP threads */
-        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
-
-        if (inputrec->bPull)
-        {
-            /* Initialize pull code */
-            pull_work = init_pull(fplog, inputrec->pull.get(), inputrec.get(), &mtop, cr, &atomSets,
-                                  inputrec->fepvals->init_lambda);
-            if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
-            {
-                initPullHistory(pull_work, &observablesHistory);
-            }
-            if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
-            {
-                init_pull_output_files(pull_work, filenames.size(), filenames.data(), oenv, startingBehavior);
-            }
-        }
-
-        std::unique_ptr<EnforcedRotation> enforcedRotation;
-        if (inputrec->bRot)
-        {
-            /* Initialize enforced rotation code */
-            enforcedRotation = init_rot(fplog, inputrec.get(), filenames.size(), filenames.data(),
-                                        cr, &atomSets, globalState.get(), &mtop, oenv, mdrunOptions,
-                                        startingBehavior);
-        }
-
-        t_swap* swap = nullptr;
-        if (inputrec->eSwapCoords != eswapNO)
-        {
-            /* Initialize ion swapping code */
-            swap = init_swapcoords(fplog, inputrec.get(),
-                                   opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
-                                   &mtop, globalState.get(), &observablesHistory, cr, &atomSets,
-                                   oenv, mdrunOptions, startingBehavior);
-        }
-
-        /* Let makeConstraints know whether we have essential dynamics constraints. */
-        auto constr = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr,
-                                      ms, &nrnb, wcycle, fr->bMolPBC);
-
-        /* Energy terms and groups */
-        gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
-                             inputrec->fepvals->n_lambda);
-
-        // cos acceleration is only supported by md, but older tpr
-        // files might still combine it with other integrators
-        GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
-                           "cos_acceleration is only supported by integrator=md");
-
-        /* Kinetic energy data */
-        gmx_ekindata_t ekind;
-        init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
-
-        /* Set up interactive MD (IMD) */
-        auto imdSession =
-                makeImdSession(inputrec.get(), cr, wcycle, &enerd, ms, &mtop, mdlog,
-                               MASTER(cr) ? globalState->x.rvec_array() : nullptr, filenames.size(),
-                               filenames.data(), oenv, mdrunOptions.imdOptions, startingBehavior);
-
-        if (DOMAINDECOMP(cr))
-        {
-            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
-            /* This call is not included in init_domain_decomposition mainly
-             * because fr->cginfo_mb is set later.
-             */
-            dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec.get(),
-                            domdecOptions.checkBondedInteractions, fr->cginfo_mb);
-        }
-
-        if (runScheduleWork.simulationWork.useGpuBufferOps)
-        {
-            fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
-                    deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
-            fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
-                    deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
-        }
-
-        std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
-        if (gpusWereDetected
-            && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
-                || runScheduleWork.simulationWork.useGpuBufferOps))
-        {
-            GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
-                                                      ? GpuApiCallBehavior::Async
-                                                      : GpuApiCallBehavior::Sync;
-            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
-                               "GPU device stream manager should be initialized to use GPU.");
-            stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
-            fr->stateGpu = stateGpu.get();
-        }
-
-        GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
-        SimulatorBuilder simulatorBuilder;
-
-        simulatorBuilder.add(SimulatorStateData(globalState.get(), &observablesHistory, &enerd, &ekind));
-        simulatorBuilder.add(std::move(membedHolder));
-        simulatorBuilder.add(std::move(stopHandlerBuilder_));
-        simulatorBuilder.add(SimulatorConfig(mdrunOptions, startingBehavior, &runScheduleWork));
-
-
-        simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
-        simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
-        simulatorBuilder.add(ConstraintsParam(
-                constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
-                vsite.get()));
-        // TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
-        simulatorBuilder.add(LegacyInput(static_cast<int>(filenames.size()), filenames.data(),
-                                         inputrec.get(), fr));
-        simulatorBuilder.add(ReplicaExchangeParameters(replExParams));
-        simulatorBuilder.add(InteractiveMD(imdSession.get()));
-        simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifier()));
-        simulatorBuilder.add(CenterOfMassPulling(pull_work));
-        // Todo move to an MDModule
-        simulatorBuilder.add(IonSwapping(swap));
-        simulatorBuilder.add(TopologyData(&mtop, mdAtoms.get()));
-        simulatorBuilder.add(BoxDeformationHandle(deform.get()));
-        simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
-
-        /* PLUMED */
-        if(plumedswitch){
-          if(useModularSimulator) gmx_fatal(FARGS, "PLUMED is not yet compatible with GROMACS new modular simulator");
-          /* detect plumed API version */
-          int pversion=0;
-          plumed_cmd(plumedmain,"getApiVersion",&pversion);
-          if(pversion>5) {
-             int nth = gmx_omp_nthreads_get(emntDefault);
-             plumed_cmd(plumedmain,"setNumOMPthreads",&nth);
-          }
-          /* set GPU device id */
-          if(pversion>9) {
-             plumed_cmd(plumedmain,"setGpuDeviceId", &deviceId);
-          }
-          if(useGpuForUpdate) {
-             GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This simulation is resident on GPU (-update gpu)\n"
-                        "but also runs PLUMED (-plumed ). Unless plumed actions are performed\n" 
-                        "only on neighbour list search and/or file writing steps, this will lead to WRONG RESULTS.\n" 
-                        "Stop it and run it again with -update cpu.\n");
-          } 
-        }
-        /* END PLUMED */
-
-        // build and run simulator object based on user-input
-        auto simulator = simulatorBuilder.build(useModularSimulator);
-        simulator->run();
-
-        if (fr->pmePpCommGpu)
-        {
-            // destroy object since it is no longer required. (This needs to be done while the GPU context still exists.)
-            fr->pmePpCommGpu.reset();
-        }
-
-        if (inputrec->bPull)
-        {
-            finish_pull(pull_work);
-        }
-        finish_swapcoords(swap);
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
-        /* do PME only */
-        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
-        gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec.get(), pmeRunMode,
-                    deviceStreamManager.get());
-    }
-
-    wallcycle_stop(wcycle, ewcRUN);
-
-    /* Finish up, write some stuff
-     * if rerunMD, don't write last frame again
-     */
-    finish_run(fplog, mdlog, cr, inputrec.get(), &nrnb, wcycle, walltime_accounting,
-               fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
-
-    // clean up cycle counter
-    wallcycle_destroy(wcycle);
-
-    deviceStreamManager.reset(nullptr);
-    // Free PME data
-    if (pmedata)
-    {
-        gmx_pme_destroy(pmedata);
-        pmedata = nullptr;
-    }
-
-    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
-    // before we destroy the GPU context(s)
-    // Pinned buffers are associated with contexts in CUDA.
-    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
-    mdAtoms.reset(nullptr);
-    globalState.reset(nullptr);
-    mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
-    gpuBonded.reset(nullptr);
-    /* Free pinned buffers in *fr */
-    delete fr;
-    fr = nullptr;
-    // TODO convert to C++ so we can get rid of these frees
-    sfree(disresdata);
-    sfree(oriresdata);
-
-    if (!hwinfo_->deviceInfoList.empty())
-    {
-        /* stop the GPU profiler (only CUDA) */
-        stopGpuProfiler();
-    }
-
-    /* With tMPI we need to wait for all ranks to finish deallocation before
-     * destroying the CUDA context as some tMPI ranks may be sharing
-     * GPU and context.
-     *
-     * This is not a concern in OpenCL where we use one context per rank.
-     *
-     * Note: it is safe to not call the barrier on the ranks which do not use GPU,
-     * but it is easier and more futureproof to call it on the whole node.
-     *
-     * Note that this function needs to be called even if GPUs are not used
-     * in this run because the PME ranks have no knowledge of whether GPUs
-     * are used or not, but all ranks need to enter the barrier below.
-     * \todo Remove this physical node barrier after making sure
-     * that it's not needed anymore (with a shared GPU run).
-     */
-    if (GMX_THREAD_MPI)
-    {
-        physicalNodeComm.barrier();
-    }
-    releaseDevice(deviceInfo);
-
-    /* Does what it says */
-    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
-    walltime_accounting_destroy(walltime_accounting);
-
-    /* PLUMED */
-    if(plumedswitch){
-      plumed_finalize(plumedmain);
-    }
-    /* END PLUMED */
-
-    // Ensure log file content is written
-    if (logFileHandle)
-    {
-        gmx_fio_flush(logFileHandle);
-    }
-
-    /* Reset FPEs (important for unit tests) by disabling them. Assumes no
-     * exceptions were enabled before function was called. */
-    if (bEnableFPE)
-    {
-        gmx_fedisableexcept();
-    }
-
-    auto rc = static_cast<int>(gmx_get_stop_condition());
-
-#if GMX_THREAD_MPI
-    /* we need to join all threads. The sub-threads join when they
-       exit this function, but the master thread needs to be told to
-       wait for that. */
-    if (MASTER(cr))
-    {
-        tMPI_Finalize();
-    }
-#endif
-    return rc;
-} // namespace gmx
-
-Mdrunner::~Mdrunner()
-{
-    // Clean up of the Manager.
-    // This will end up getting called on every thread-MPI rank, which is unnecessary,
-    // but okay as long as threads synchronize some time before adding or accessing
-    // a new set of restraints.
-    if (restraintManager_)
-    {
-        restraintManager_->clear();
-        GMX_ASSERT(restraintManager_->countRestraints() == 0,
-                   "restraints added during runner life time should be cleared at runner "
-                   "destruction.");
-    }
-};
-
-void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller, const std::string& name)
-{
-    GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
-    // Not sure if this should be logged through the md logger or something else,
-    // but it is helpful to have some sort of INFO level message sent somewhere.
-    //    std::cout << "Registering restraint named " << name << std::endl;
-
-    // When multiple restraints are used, it may be wasteful to register them separately.
-    // Maybe instead register an entire Restraint Manager as a force provider.
-    restraintManager_->addToSpec(std::move(puller), name);
-}
-
-Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules) : mdModules_(std::move(mdModules)) {}
-
-Mdrunner::Mdrunner(Mdrunner&&) noexcept = default;
-
-//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265 in CentOS 7
-Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
-
-class Mdrunner::BuilderImplementation
-{
-public:
-    BuilderImplementation() = delete;
-    BuilderImplementation(std::unique_ptr<MDModules> mdModules, compat::not_null<SimulationContext*> context);
-    ~BuilderImplementation();
-
-    BuilderImplementation& setExtraMdrunOptions(const MdrunOptions& options,
-                                                real                forceWarningThreshold,
-                                                StartingBehavior    startingBehavior);
-
-    void addHardwareDetectionResult(const gmx_hw_info_t* hwinfo);
-
-    void addDomdec(const DomdecOptions& options);
-
-    void addInput(SimulationInputHandle inputHolder);
-
-    void addVerletList(int nstlist);
-
-    void addReplicaExchange(const ReplicaExchangeParameters& params);
-
-    void addNonBonded(const char* nbpu_opt);
-
-    void addPME(const char* pme_opt_, const char* pme_fft_opt_);
-
-    void addBondedTaskAssignment(const char* bonded_opt);
-
-    void addUpdateTaskAssignment(const char* update_opt);
-
-    void addHardwareOptions(const gmx_hw_opt_t& hardwareOptions);
-
-    void addFilenames(ArrayRef<const t_filenm> filenames);
-
-    void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
-
-    void addLogFile(t_fileio* logFileHandle);
-
-    void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
-
-    Mdrunner build();
-
-private:
-    // Default parameters copied from runner.h
-    // \todo Clarify source(s) of default parameters.
-
-    const char* nbpu_opt_    = nullptr;
-    const char* pme_opt_     = nullptr;
-    const char* pme_fft_opt_ = nullptr;
-    const char* bonded_opt_  = nullptr;
-    const char* update_opt_  = nullptr;
-
-    MdrunOptions mdrunOptions_;
-
-    DomdecOptions domdecOptions_;
-
-    ReplicaExchangeParameters replicaExchangeParameters_;
-
-    //! Command-line override for the duration of a neighbor list with the Verlet scheme.
-    int nstlist_ = 0;
-
-    //! World communicator, used for hardware detection and task assignment
-    MPI_Comm libraryWorldCommunicator_ = MPI_COMM_NULL;
-
-    //! Multisim communicator handle.
-    gmx_multisim_t* multiSimulation_;
-
-    //! mdrun communicator
-    MPI_Comm simulationCommunicator_ = MPI_COMM_NULL;
-
-    //! Print a warning if any force is larger than this (in kJ/mol nm).
-    real forceWarningThreshold_ = -1;
-
-    //! Whether the simulation will start afresh, or restart with/without appending.
-    StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
-
-    //! The modules that comprise the functionality of mdrun.
-    std::unique_ptr<MDModules> mdModules_;
-
-    //! Detected hardware.
-    const gmx_hw_info_t* hwinfo_ = nullptr;
-
-    //! \brief Parallelism information.
-    gmx_hw_opt_t hardwareOptions_;
-
-    //! filename options for simulation.
-    ArrayRef<const t_filenm> filenames_;
-
-    /*! \brief Handle to output environment.
-     *
-     * \todo gmx_output_env_t needs lifetime management.
-     */
-    gmx_output_env_t* outputEnvironment_ = nullptr;
-
-    /*! \brief Non-owning handle to MD log file.
-     *
-     * \todo Context should own output facilities for client.
-     * \todo Improve log file handle management.
-     * \internal
-     * Code managing the FILE* relies on the ability to set it to
-     * nullptr to check whether the filehandle is valid.
-     */
-    t_fileio* logFileHandle_ = nullptr;
-
-    /*!
-     * \brief Builder for simulation stop signal handler.
-     */
-    std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
-
-    /*!
-     * \brief Sources for initial simulation state.
-     *
-     * See issue #3652 for near-term refinements to the SimulationInput interface.
-     *
-     * See issue #3379 for broader discussion on API aspects of simulation inputs and outputs.
-     */
-    SimulationInputHandle inputHolder_;
-};
-
-Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
-                                                       compat::not_null<SimulationContext*> context) :
-    mdModules_(std::move(mdModules))
-{
-    libraryWorldCommunicator_ = context->libraryWorldCommunicator_;
-    simulationCommunicator_   = context->simulationCommunicator_;
-    multiSimulation_          = context->multiSimulation_.get();
-}
-
-Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
-
-Mdrunner::BuilderImplementation&
-Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions&    options,
-                                                      const real             forceWarningThreshold,
-                                                      const StartingBehavior startingBehavior)
-{
-    mdrunOptions_          = options;
-    forceWarningThreshold_ = forceWarningThreshold;
-    startingBehavior_      = startingBehavior;
-    return *this;
-}
-
-void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions& options)
-{
-    domdecOptions_ = options;
-}
-
-void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
-{
-    nstlist_ = nstlist;
-}
-
-void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters& params)
-{
-    replicaExchangeParameters_ = params;
-}
-
-Mdrunner Mdrunner::BuilderImplementation::build()
-{
-    auto newRunner = Mdrunner(std::move(mdModules_));
-
-    newRunner.mdrunOptions     = mdrunOptions_;
-    newRunner.pforce           = forceWarningThreshold_;
-    newRunner.startingBehavior = startingBehavior_;
-    newRunner.domdecOptions    = domdecOptions_;
-
-    // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
-    newRunner.hw_opt = hardwareOptions_;
-
-    // No invariant to check. This parameter exists to optionally override other behavior.
-    newRunner.nstlist_cmdline = nstlist_;
-
-    newRunner.replExParams = replicaExchangeParameters_;
-
-    newRunner.filenames = filenames_;
-
-    newRunner.libraryWorldCommunicator = libraryWorldCommunicator_;
-
-    newRunner.simulationCommunicator = simulationCommunicator_;
-
-    // nullptr is a valid value for the multisim handle
-    newRunner.ms = multiSimulation_;
-
-    if (hwinfo_)
-    {
-        newRunner.hwinfo_ = hwinfo_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addHardwareDetectionResult() is required before build()"));
-    }
-
-    if (inputHolder_)
-    {
-        newRunner.inputHolder_ = std::move(inputHolder_);
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError("MdrunnerBuilder::addInput() is required before build()."));
-    }
-
-    // \todo Clarify ownership and lifetime management for gmx_output_env_t
-    // \todo Update sanity checking when output environment has clearly specified invariants.
-    // Initialization and default values for oenv are not well specified in the current version.
-    if (outputEnvironment_)
-    {
-        newRunner.oenv = outputEnvironment_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addOutputEnvironment() is required before build()"));
-    }
-
-    newRunner.logFileHandle = logFileHandle_;
-
-    if (nbpu_opt_)
-    {
-        newRunner.nbpu_opt = nbpu_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
-    }
-
-    if (pme_opt_ && pme_fft_opt_)
-    {
-        newRunner.pme_opt     = pme_opt_;
-        newRunner.pme_fft_opt = pme_fft_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
-    }
-
-    if (bonded_opt_)
-    {
-        newRunner.bonded_opt = bonded_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
-    }
-
-    if (update_opt_)
-    {
-        newRunner.update_opt = update_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addUpdateTaskAssignment() is required before build()  "));
-    }
-
-
-    newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
-
-    if (stopHandlerBuilder_)
-    {
-        newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
-    }
-    else
-    {
-        newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
-    }
-
-    return newRunner;
-}
-
-void Mdrunner::BuilderImplementation::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
-{
-    hwinfo_ = hwinfo;
-}
-
-void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
-{
-    nbpu_opt_ = nbpu_opt;
-}
-
-void Mdrunner::BuilderImplementation::addPME(const char* pme_opt, const char* pme_fft_opt)
-{
-    pme_opt_     = pme_opt;
-    pme_fft_opt_ = pme_fft_opt;
-}
-
-void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
-{
-    bonded_opt_ = bonded_opt;
-}
-
-void Mdrunner::BuilderImplementation::addUpdateTaskAssignment(const char* update_opt)
-{
-    update_opt_ = update_opt;
-}
-
-void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
-{
-    hardwareOptions_ = hardwareOptions;
-}
-
-void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
-{
-    filenames_ = filenames;
-}
-
-void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
-{
-    outputEnvironment_ = outputEnvironment;
-}
-
-void Mdrunner::BuilderImplementation::addLogFile(t_fileio* logFileHandle)
-{
-    logFileHandle_ = logFileHandle;
-}
-
-void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
-{
-    stopHandlerBuilder_ = std::move(builder);
-}
-
-void Mdrunner::BuilderImplementation::addInput(SimulationInputHandle inputHolder)
-{
-    inputHolder_ = std::move(inputHolder);
-}
-
-MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
-                                 compat::not_null<SimulationContext*> context) :
-    impl_{ std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context) }
-{
-}
-
-MdrunnerBuilder::~MdrunnerBuilder() = default;
-
-MdrunnerBuilder& MdrunnerBuilder::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
-{
-    impl_->addHardwareDetectionResult(hwinfo);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addSimulationMethod(const MdrunOptions&    options,
-                                                      real                   forceWarningThreshold,
-                                                      const StartingBehavior startingBehavior)
-{
-    impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addDomainDecomposition(const DomdecOptions& options)
-{
-    impl_->addDomdec(options);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addNeighborList(int nstlist)
-{
-    impl_->addVerletList(nstlist);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters& params)
-{
-    impl_->addReplicaExchange(params);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
-{
-    impl_->addNonBonded(nbpu_opt);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addElectrostatics(const char* pme_opt, const char* pme_fft_opt)
-{
-    // The builder method may become more general in the future, but in this version,
-    // parameters for PME electrostatics are both required and the only parameters
-    // available.
-    if (pme_opt && pme_fft_opt)
-    {
-        impl_->addPME(pme_opt, pme_fft_opt);
-    }
-    else
-    {
-        GMX_THROW(
-                gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
-    }
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
-{
-    impl_->addBondedTaskAssignment(bonded_opt);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addUpdateTaskAssignment(const char* update_opt)
-{
-    impl_->addUpdateTaskAssignment(update_opt);
-    return *this;
-}
-
-Mdrunner MdrunnerBuilder::build()
-{
-    return impl_->build();
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
-{
-    impl_->addHardwareOptions(hardwareOptions);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
-{
-    impl_->addFilenames(filenames);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
-{
-    impl_->addOutputEnvironment(outputEnvironment);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addLogFile(t_fileio* logFileHandle)
-{
-    impl_->addLogFile(logFileHandle);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
-{
-    impl_->addStopHandlerBuilder(std::move(builder));
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addInput(SimulationInputHandle input)
-{
-    impl_->addInput(std::move(input));
-    return *this;
-}
-
-MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder&&) noexcept = default;
-
-MdrunnerBuilder& MdrunnerBuilder::operator=(MdrunnerBuilder&&) noexcept = default;
-
-} // namespace gmx
diff --git a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/runner.cpp.preplumed b/patches/gromacs-2021.7.diff/src/gromacs/mdrun/runner.cpp.preplumed
deleted file mode 100644
index 232d994e1a..0000000000
--- a/patches/gromacs-2021.7.diff/src/gromacs/mdrun/runner.cpp.preplumed
+++ /dev/null
@@ -1,2352 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements the MD runner routine calling all integrators.
- *
- * \author David van der Spoel <david.vanderspoel@icm.uu.se>
- * \ingroup module_mdrun
- */
-#include "gmxpre.h"
-
-#include "runner.h"
-
-#include "config.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <csignal>
-#include <cstdlib>
-#include <cstring>
-
-#include <algorithm>
-#include <memory>
-
-#include "gromacs/commandline/filenm.h"
-#include "gromacs/domdec/builder.h"
-#include "gromacs/domdec/domdec.h"
-#include "gromacs/domdec/domdec_struct.h"
-#include "gromacs/domdec/gpuhaloexchange.h"
-#include "gromacs/domdec/localatomsetmanager.h"
-#include "gromacs/domdec/partition.h"
-#include "gromacs/ewald/ewald_utils.h"
-#include "gromacs/ewald/pme_gpu_program.h"
-#include "gromacs/ewald/pme_only.h"
-#include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/fileio/checkpoint.h"
-#include "gromacs/fileio/gmxfio.h"
-#include "gromacs/fileio/oenv.h"
-#include "gromacs/fileio/tpxio.h"
-#include "gromacs/gmxlib/network.h"
-#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
-#include "gromacs/hardware/cpuinfo.h"
-#include "gromacs/hardware/detecthardware.h"
-#include "gromacs/hardware/device_management.h"
-#include "gromacs/hardware/hardwaretopology.h"
-#include "gromacs/hardware/printhardware.h"
-#include "gromacs/imd/imd.h"
-#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
-#include "gromacs/listed_forces/orires.h"
-#include "gromacs/math/functions.h"
-#include "gromacs/math/utilities.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/boxdeformation.h"
-#include "gromacs/mdlib/broadcaststructs.h"
-#include "gromacs/mdlib/calc_verletbuf.h"
-#include "gromacs/mdlib/dispersioncorrection.h"
-#include "gromacs/mdlib/enerdata_utils.h"
-#include "gromacs/mdlib/force.h"
-#include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/gpuforcereduction.h"
-#include "gromacs/mdlib/makeconstraints.h"
-#include "gromacs/mdlib/md_support.h"
-#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdlib/tgroup.h"
-#include "gromacs/mdlib/updategroups.h"
-#include "gromacs/mdlib/vsite.h"
-#include "gromacs/mdrun/mdmodules.h"
-#include "gromacs/mdrun/simulationcontext.h"
-#include "gromacs/mdrun/simulationinput.h"
-#include "gromacs/mdrun/simulationinputhandle.h"
-#include "gromacs/mdrunutility/handlerestart.h"
-#include "gromacs/mdrunutility/logging.h"
-#include "gromacs/mdrunutility/multisim.h"
-#include "gromacs/mdrunutility/printtime.h"
-#include "gromacs/mdrunutility/threadaffinity.h"
-#include "gromacs/mdtypes/checkpointdata.h"
-#include "gromacs/mdtypes/commrec.h"
-#include "gromacs/mdtypes/enerdata.h"
-#include "gromacs/mdtypes/fcdata.h"
-#include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/mdtypes/interaction_const.h"
-#include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/mdtypes/mdrunoptions.h"
-#include "gromacs/mdtypes/observableshistory.h"
-#include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/mdtypes/state.h"
-#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/modularsimulator/modularsimulator.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/nbnxm/pairlist_tuning.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pulling/output.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/restraint/manager.h"
-#include "gromacs/restraint/restraintmdmodule.h"
-#include "gromacs/restraint/restraintpotential.h"
-#include "gromacs/swap/swapcoords.h"
-#include "gromacs/taskassignment/decidegpuusage.h"
-#include "gromacs/taskassignment/decidesimulationworkload.h"
-#include "gromacs/taskassignment/resourcedivision.h"
-#include "gromacs/taskassignment/taskassignment.h"
-#include "gromacs/taskassignment/usergpuids.h"
-#include "gromacs/timing/gpu_timing.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/wallcyclereporting.h"
-#include "gromacs/topology/mtop_util.h"
-#include "gromacs/trajectory/trajectoryframe.h"
-#include "gromacs/utility/basenetwork.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/utility/exceptions.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/filestream.h"
-#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/keyvaluetree.h"
-#include "gromacs/utility/logger.h"
-#include "gromacs/utility/loggerbuilder.h"
-#include "gromacs/utility/mdmodulenotification.h"
-#include "gromacs/utility/physicalnodecommunicator.h"
-#include "gromacs/utility/pleasecite.h"
-#include "gromacs/utility/programcontext.h"
-#include "gromacs/utility/smalloc.h"
-#include "gromacs/utility/stringutil.h"
-
-#include "isimulator.h"
-#include "membedholder.h"
-#include "replicaexchange.h"
-#include "simulatorbuilder.h"
-
-namespace gmx
-{
-
-
-/*! \brief Manage any development feature flag variables encountered
- *
- * The use of dev features indicated by environment variables is
- * logged in order to ensure that runs with such features enabled can
- * be identified from their log and standard output. Any cross
- * dependencies are also checked, and if unsatisfied, a fatal error
- * issued.
- *
- * Note that some development features overrides are applied already here:
- * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
- *
- * \param[in]  mdlog                Logger object.
- * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
- * \param[in]  pmeRunMode           The PME run mode for this run
- * \returns                         The object populated with development feature flags.
- */
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
-                                                         const bool           useGpuForNonbonded,
-                                                         const PmeRunMode     pmeRunMode)
-{
-    DevelopmentFeatureFlags devFlags;
-
-    // Some builds of GCC 5 give false positive warnings that these
-    // getenv results are ignored when clearly they are used.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-result"
-
-    devFlags.enableGpuBufferOps =
-            GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
-    devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
-    devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
-    devFlags.enableGpuPmePPComm =
-            GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
-
-#pragma GCC diagnostic pop
-
-    if (devFlags.enableGpuBufferOps)
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This run uses the 'GPU buffer ops' feature, enabled by the "
-                        "GMX_USE_GPU_BUFFER_OPS environment variable.");
-    }
-
-    if (devFlags.forceGpuUpdateDefault)
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This run will default to '-update gpu' as requested by the "
-                        "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable. GPU update with domain "
-                        "decomposition lacks substantial testing and should be used with caution.");
-    }
-
-    if (devFlags.enableGpuHaloExchange)
-    {
-        if (useGpuForNonbonded)
-        {
-            if (!devFlags.enableGpuBufferOps)
-            {
-                GMX_LOG(mdlog.warning)
-                        .asParagraph()
-                        .appendTextFormatted(
-                                "Enabling GPU buffer operations required by GMX_GPU_DD_COMMS "
-                                "(equivalent with GMX_USE_GPU_BUFFER_OPS=1).");
-                devFlags.enableGpuBufferOps = true;
-            }
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "This run has requested the 'GPU halo exchange' feature, enabled by "
-                            "the "
-                            "GMX_GPU_DD_COMMS environment variable.");
-        }
-        else
-        {
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
-                            "halo exchange' feature will not be enabled as nonbonded interactions "
-                            "are not offloaded.");
-            devFlags.enableGpuHaloExchange = false;
-        }
-    }
-
-    if (devFlags.enableGpuPmePPComm)
-    {
-        if (pmeRunMode == PmeRunMode::GPU)
-        {
-            if (!devFlags.enableGpuBufferOps)
-            {
-                GMX_LOG(mdlog.warning)
-                        .asParagraph()
-                        .appendTextFormatted(
-                                "Enabling GPU buffer operations required by GMX_GPU_PME_PP_COMMS "
-                                "(equivalent with GMX_USE_GPU_BUFFER_OPS=1).");
-                devFlags.enableGpuBufferOps = true;
-            }
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "This run uses the 'GPU PME-PP communications' feature, enabled "
-                            "by the GMX_GPU_PME_PP_COMMS environment variable.");
-        }
-        else
-        {
-            std::string clarification;
-            if (pmeRunMode == PmeRunMode::Mixed)
-            {
-                clarification =
-                        "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
-                        "mode).";
-            }
-            else
-            {
-                clarification = "PME is not offloaded to the GPU.";
-            }
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendText(
-                            "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
-                            "'GPU PME-PP communications' feature was not enabled as "
-                            + clarification);
-            devFlags.enableGpuPmePPComm = false;
-        }
-    }
-
-    return devFlags;
-}
-
-/*! \brief Barrier for safe simultaneous thread access to mdrunner data
- *
- * Used to ensure that the master thread does not modify mdrunner during copy
- * on the spawned threads. */
-static void threadMpiMdrunnerAccessBarrier()
-{
-#if GMX_THREAD_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-}
-
-Mdrunner Mdrunner::cloneOnSpawnedThread() const
-{
-    auto newRunner = Mdrunner(std::make_unique<MDModules>());
-
-    // All runners in the same process share a restraint manager resource because it is
-    // part of the interface to the client code, which is associated only with the
-    // original thread. Handles to the same resources can be obtained by copy.
-    {
-        newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
-    }
-
-    // Copy members of master runner.
-    // \todo Replace with builder when Simulation context and/or runner phases are better defined.
-    // Ref https://gitlab.com/gromacs/gromacs/-/issues/2587 and https://gitlab.com/gromacs/gromacs/-/issues/2375
-    newRunner.hw_opt    = hw_opt;
-    newRunner.filenames = filenames;
-
-    newRunner.hwinfo_         = hwinfo_;
-    newRunner.oenv            = oenv;
-    newRunner.mdrunOptions    = mdrunOptions;
-    newRunner.domdecOptions   = domdecOptions;
-    newRunner.nbpu_opt        = nbpu_opt;
-    newRunner.pme_opt         = pme_opt;
-    newRunner.pme_fft_opt     = pme_fft_opt;
-    newRunner.bonded_opt      = bonded_opt;
-    newRunner.update_opt      = update_opt;
-    newRunner.nstlist_cmdline = nstlist_cmdline;
-    newRunner.replExParams    = replExParams;
-    newRunner.pforce          = pforce;
-    // Give the spawned thread the newly created valid communicator
-    // for the simulation.
-    newRunner.libraryWorldCommunicator = MPI_COMM_WORLD;
-    newRunner.simulationCommunicator   = MPI_COMM_WORLD;
-    newRunner.ms                       = ms;
-    newRunner.startingBehavior         = startingBehavior;
-    newRunner.stopHandlerBuilder_      = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
-    newRunner.inputHolder_             = inputHolder_;
-
-    threadMpiMdrunnerAccessBarrier();
-
-    return newRunner;
-}
-
-/*! \brief The callback used for running on spawned threads.
- *
- * Obtains the pointer to the master mdrunner object from the one
- * argument permitted to the thread-launch API call, copies it to make
- * a new runner for this thread, reinitializes necessary data, and
- * proceeds to the simulation. */
-static void mdrunner_start_fn(const void* arg)
-{
-    try
-    {
-        auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
-        /* copy the arg list to make sure that it's thread-local. This
-           doesn't copy pointed-to items, of course; fnm, cr and fplog
-           are reset in the call below, all others should be const. */
-        gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
-        mdrunner.mdrunner();
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-}
-
-
-void Mdrunner::spawnThreads(int numThreadsToLaunch)
-{
-#if GMX_THREAD_MPI
-    /* now spawn new threads that start mdrunner_start_fn(), while
-       the main thread returns. Thread affinity is handled later. */
-    if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn,
-                     static_cast<const void*>(this))
-        != TMPI_SUCCESS)
-    {
-        GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
-    }
-
-    // Give the master thread the newly created valid communicator for
-    // the simulation.
-    libraryWorldCommunicator = MPI_COMM_WORLD;
-    simulationCommunicator   = MPI_COMM_WORLD;
-    threadMpiMdrunnerAccessBarrier();
-#else
-    GMX_UNUSED_VALUE(numThreadsToLaunch);
-    GMX_UNUSED_VALUE(mdrunner_start_fn);
-#endif
-}
-
-} // namespace gmx
-
-/*! \brief Initialize variables for Verlet scheme simulation */
-static void prepare_verlet_scheme(FILE*               fplog,
-                                  t_commrec*          cr,
-                                  t_inputrec*         ir,
-                                  int                 nstlist_cmdline,
-                                  const gmx_mtop_t*   mtop,
-                                  const matrix        box,
-                                  bool                makeGpuPairList,
-                                  const gmx::CpuInfo& cpuinfo)
-{
-    // We checked the cut-offs in grompp, but double-check here.
-    // We have PME+LJcutoff kernels for rcoulomb>rvdw.
-    if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == eelCUT)
-    {
-        GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
-                           "With Verlet lists and PME we should have rcoulomb>=rvdw");
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw,
-                           "With Verlet lists and no PME rcoulomb and rvdw should be identical");
-    }
-    /* For NVE simulations, we will retain the initial list buffer */
-    if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
-    {
-        /* Update the Verlet buffer size for the current run setup */
-
-        /* Here we assume SIMD-enabled kernels are being used. But as currently
-         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
-         * and 4x2 gives a larger buffer than 4x4, this is ok.
-         */
-        ListSetupType listType =
-                (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
-        VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
-
-        const real rlist_new =
-                calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
-
-        if (rlist_new != ir->rlist)
-        {
-            if (fplog != nullptr)
-            {
-                fprintf(fplog,
-                        "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
-                        ir->rlist, rlist_new, listSetup.cluster_size_i, listSetup.cluster_size_j);
-            }
-            ir->rlist = rlist_new;
-        }
-    }
-
-    if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
-    {
-        gmx_fatal(FARGS, "Can not set nstlist without %s",
-                  !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
-    }
-
-    if (EI_DYNAMICS(ir->eI))
-    {
-        /* Set or try nstlist values */
-        increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
-    }
-}
-
-/*! \brief Override the nslist value in inputrec
- *
- * with value passed on the command line (if any)
- */
-static void override_nsteps_cmdline(const gmx::MDLogger& mdlog, int64_t nsteps_cmdline, t_inputrec* ir)
-{
-    assert(ir);
-
-    /* override with anything else than the default -2 */
-    if (nsteps_cmdline > -2)
-    {
-        char sbuf_steps[STEPSTRSIZE];
-        char sbuf_msg[STRLEN];
-
-        ir->nsteps = nsteps_cmdline;
-        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
-        {
-            sprintf(sbuf_msg,
-                    "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
-                    gmx_step_str(nsteps_cmdline, sbuf_steps), fabs(nsteps_cmdline * ir->delta_t));
-        }
-        else
-        {
-            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
-                    gmx_step_str(nsteps_cmdline, sbuf_steps));
-        }
-
-        GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
-    }
-    else if (nsteps_cmdline < -2)
-    {
-        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64, nsteps_cmdline);
-    }
-    /* Do nothing if nsteps_cmdline == -2 */
-}
-
-namespace gmx
-{
-
-/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
- *
- * If not, and if a warning may be issued, logs a warning about
- * falling back to CPU code. With thread-MPI, only the first
- * call to this function should have \c issueWarning true. */
-static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger& mdlog, const t_inputrec& ir, bool issueWarning)
-{
-    bool        gpuIsUseful = true;
-    std::string warning;
-
-    if (ir.opts.ngener - ir.nwall > 1)
-    {
-        /* The GPU code does not support more than one energy group.
-         * If the user requested GPUs explicitly, a fatal error is given later.
-         */
-        gpuIsUseful = false;
-        warning =
-                "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
-                "For better performance, run on the GPU without energy groups and then do "
-                "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
-    }
-
-    if (EI_TPI(ir.eI))
-    {
-        gpuIsUseful = false;
-        warning     = "TPI is not implemented for GPUs.";
-    }
-
-    if (!gpuIsUseful && issueWarning)
-    {
-        GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
-    }
-
-    return gpuIsUseful;
-}
-
-//! Initializes the logger for mdrun.
-static gmx::LoggerOwner buildLogger(FILE* fplog, const bool isSimulationMasterRank)
-{
-    gmx::LoggerBuilder builder;
-    if (fplog != nullptr)
-    {
-        builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
-    }
-    if (isSimulationMasterRank)
-    {
-        builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
-    }
-    return builder.build();
-}
-
-//! Make a TaskTarget from an mdrun argument string.
-static TaskTarget findTaskTarget(const char* optionString)
-{
-    TaskTarget returnValue = TaskTarget::Auto;
-
-    if (strncmp(optionString, "auto", 3) == 0)
-    {
-        returnValue = TaskTarget::Auto;
-    }
-    else if (strncmp(optionString, "cpu", 3) == 0)
-    {
-        returnValue = TaskTarget::Cpu;
-    }
-    else if (strncmp(optionString, "gpu", 3) == 0)
-    {
-        returnValue = TaskTarget::Gpu;
-    }
-    else
-    {
-        GMX_ASSERT(false, "Option string should have been checked for sanity already");
-    }
-
-    return returnValue;
-}
-
-//! Finish run, aggregate data to print performance info.
-static void finish_run(FILE*                     fplog,
-                       const gmx::MDLogger&      mdlog,
-                       const t_commrec*          cr,
-                       const t_inputrec*         inputrec,
-                       t_nrnb                    nrnb[],
-                       gmx_wallcycle_t           wcycle,
-                       gmx_walltime_accounting_t walltime_accounting,
-                       nonbonded_verlet_t*       nbv,
-                       const gmx_pme_t*          pme,
-                       gmx_bool                  bWriteStat)
-{
-    double delta_t = 0;
-    double nbfs = 0, mflop = 0;
-    double elapsed_time, elapsed_time_over_all_ranks, elapsed_time_over_all_threads,
-            elapsed_time_over_all_threads_over_all_ranks;
-    /* Control whether it is valid to print a report. Only the
-       simulation master may print, but it should not do so if the run
-       terminated e.g. before a scheduled reset step. This is
-       complicated by the fact that PME ranks are unaware of the
-       reason why they were sent a pmerecvqxFINISH. To avoid
-       communication deadlocks, we always do the communication for the
-       report, even if we've decided not to write the report, because
-       how long it takes to finish the run is not important when we've
-       decided not to report on the simulation performance.
-
-       Further, we only report performance for dynamical integrators,
-       because those are the only ones for which we plan to
-       consider doing any optimizations. */
-    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
-
-    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendText("Simulation ended prematurely, no performance report will be written.");
-        printReport = false;
-    }
-
-    t_nrnb*                 nrnb_tot;
-    std::unique_ptr<t_nrnb> nrnbTotalStorage;
-    if (cr->nnodes > 1)
-    {
-        nrnbTotalStorage = std::make_unique<t_nrnb>();
-        nrnb_tot         = nrnbTotalStorage.get();
-#if GMX_MPI
-        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
-#endif
-    }
-    else
-    {
-        nrnb_tot = nrnb;
-    }
-
-    elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
-    elapsed_time_over_all_threads =
-            walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
-    if (cr->nnodes > 1)
-    {
-#if GMX_MPI
-        /* reduce elapsed_time over all MPI ranks in the current simulation */
-        MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM,
-                      cr->mpi_comm_mysim);
-        elapsed_time_over_all_ranks /= cr->nnodes;
-        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
-         * current simulation. */
-        MPI_Allreduce(&elapsed_time_over_all_threads, &elapsed_time_over_all_threads_over_all_ranks,
-                      1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
-#endif
-    }
-    else
-    {
-        elapsed_time_over_all_ranks                  = elapsed_time;
-        elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
-    }
-
-    if (printReport)
-    {
-        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
-    }
-
-    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
-    {
-        print_dd_statistics(cr, inputrec, fplog);
-    }
-
-    /* TODO Move the responsibility for any scaling by thread counts
-     * to the code that handled the thread region, so that there's a
-     * mechanism to keep cycle counting working during the transition
-     * to task parallelism. */
-    int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
-    int nthreads_pme = gmx_omp_nthreads_get(emntPME);
-    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP),
-                                   nthreads_pp, nthreads_pme);
-    auto cycle_sum(wallcycle_sum(cr, wcycle));
-
-    if (printReport)
-    {
-        auto nbnxn_gpu_timings =
-                (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
-        gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
-
-        if (pme_gpu_task_enabled(pme))
-        {
-            pme_gpu_get_timings(pme, &pme_gpu_timings);
-        }
-        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
-                        elapsed_time_over_all_ranks, wcycle, cycle_sum, nbnxn_gpu_timings,
-                        &pme_gpu_timings);
-
-        if (EI_DYNAMICS(inputrec->eI))
-        {
-            delta_t = inputrec->delta_t;
-        }
-
-        if (fplog)
-        {
-            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
-                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
-                       delta_t, nbfs, mflop);
-        }
-        if (bWriteStat)
-        {
-            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
-                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
-                       delta_t, nbfs, mflop);
-        }
-    }
-}
-
-int Mdrunner::mdrunner()
-{
-    matrix                    box;
-    t_forcerec*               fr               = nullptr;
-    real                      ewaldcoeff_q     = 0;
-    real                      ewaldcoeff_lj    = 0;
-    int                       nChargePerturbed = -1, nTypePerturbed = 0;
-    gmx_wallcycle_t           wcycle;
-    gmx_walltime_accounting_t walltime_accounting = nullptr;
-    MembedHolder              membedHolder(filenames.size(), filenames.data());
-
-    /* CAUTION: threads may be started later on in this function, so
-       cr doesn't reflect the final parallel state right now */
-    gmx_mtop_t mtop;
-
-    /* TODO: inputrec should tell us whether we use an algorithm, not a file option */
-    const bool doEssentialDynamics = opt2bSet("-ei", filenames.size(), filenames.data());
-    const bool doRerun             = mdrunOptions.rerun;
-
-    // Handle task-assignment related user options.
-    EmulateGpuNonbonded emulateGpuNonbonded =
-            (getenv("GMX_EMULATE_GPU") != nullptr ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
-
-    std::vector<int> userGpuTaskAssignment;
-    try
-    {
-        userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-    auto nonbondedTarget = findTaskTarget(nbpu_opt);
-    auto pmeTarget       = findTaskTarget(pme_opt);
-    auto pmeFftTarget    = findTaskTarget(pme_fft_opt);
-    auto bondedTarget    = findTaskTarget(bonded_opt);
-    auto updateTarget    = findTaskTarget(update_opt);
-
-    FILE* fplog = nullptr;
-    // If we are appending, we don't write log output because we need
-    // to check that the old log file matches what the checkpoint file
-    // expects. Otherwise, we should start to write log output now if
-    // there is a file ready for it.
-    if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
-    {
-        fplog = gmx_fio_getfp(logFileHandle);
-    }
-    const bool isSimulationMasterRank = findIsSimulationMasterRank(ms, simulationCommunicator);
-    gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
-    gmx::MDLogger    mdlog(logOwner.logger());
-
-    gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
-
-    std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo_->deviceInfoList, hw_opt.gpuIdsAvailable);
-    const int        numDevicesToUse = gmx::ssize(gpuIdsToUse);
-
-    // Print citation requests after all software/hardware printing
-    pleaseCiteGromacs(fplog);
-
-    // Note: legacy program logic relies on checking whether these pointers are assigned.
-    // Objects may or may not be allocated later.
-    std::unique_ptr<t_inputrec> inputrec;
-    std::unique_ptr<t_state>    globalState;
-
-    auto partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
-
-    if (isSimulationMasterRank)
-    {
-        // Allocate objects to be initialized by later function calls.
-        /* Only the master rank has the global state */
-        globalState = std::make_unique<t_state>();
-        inputrec    = std::make_unique<t_inputrec>();
-
-        /* Read (nearly) all data required for the simulation
-         * and keep the partly serialized tpr contents to send to other ranks later
-         */
-        applyGlobalSimulationState(*inputHolder_.get(), partialDeserializedTpr.get(),
-                                   globalState.get(), inputrec.get(), &mtop);
-    }
-
-    /* Check and update the hardware options for internal consistency */
-    checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks,
-                                  inputrec.get());
-
-    if (GMX_THREAD_MPI && isSimulationMasterRank)
-    {
-        bool useGpuForNonbonded = false;
-        bool useGpuForPme       = false;
-        try
-        {
-            GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
-
-            // If the user specified the number of ranks, then we must
-            // respect that, but in default mode, we need to allow for
-            // the number of GPUs to choose the number of ranks.
-            auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
-            useGpuForNonbonded         = decideWhetherToUseGpusForNonbondedWithThreadMpi(
-                    nonbondedTarget, numDevicesToUse, userGpuTaskAssignment, emulateGpuNonbonded,
-                    canUseGpuForNonbonded,
-                    gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
-                    hw_opt.nthreads_tmpi);
-            useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
-                    useGpuForNonbonded, pmeTarget, pmeFftTarget, numDevicesToUse, userGpuTaskAssignment,
-                    *hwinfo_, *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-        /* Determine how many thread-MPI ranks to start.
-         *
-         * TODO Over-writing the user-supplied value here does
-         * prevent any possible subsequent checks from working
-         * correctly. */
-        hw_opt.nthreads_tmpi =
-                get_nthreads_mpi(hwinfo_, &hw_opt, numDevicesToUse, useGpuForNonbonded, useGpuForPme,
-                                 inputrec.get(), &mtop, mdlog, membedHolder.doMembed());
-
-        // Now start the threads for thread MPI.
-        spawnThreads(hw_opt.nthreads_tmpi);
-        // The spawned threads enter mdrunner() and execution of
-        // master and spawned threads joins at the end of this block.
-    }
-
-    GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
-                       "Must have valid communicator unless running a multi-simulation");
-    CommrecHandle crHandle = init_commrec(simulationCommunicator);
-    t_commrec*    cr       = crHandle.get();
-    GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
-
-    PhysicalNodeCommunicator physicalNodeComm(libraryWorldCommunicator, gmx_physicalnode_id_hash());
-
-    // If we detected the topology on this system, double-check that it makes sense
-    if (hwinfo_->hardwareTopology->isThisSystem())
-    {
-        hardwareTopologyDoubleCheckDetection(mdlog, *hwinfo_->hardwareTopology);
-    }
-
-    if (PAR(cr))
-    {
-        /* now broadcast everything to the non-master nodes/threads: */
-        if (!isSimulationMasterRank)
-        {
-            // Until now, only the master rank has a non-null pointer.
-            // On non-master ranks, allocate the object that will receive data in the following call.
-            inputrec = std::make_unique<t_inputrec>();
-        }
-        init_parallel(cr->mpiDefaultCommunicator, MASTER(cr), inputrec.get(), &mtop,
-                      partialDeserializedTpr.get());
-    }
-    GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
-    partialDeserializedTpr.reset(nullptr);
-
-    // Now the number of ranks is known to all ranks, and each knows
-    // the inputrec read by the master rank. The ranks can now all run
-    // the task-deciding functions and will agree on the result
-    // without needing to communicate.
-    const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
-
-    // Note that these variables describe only their own node.
-    //
-    // Note that when bonded interactions run on a GPU they always run
-    // alongside a nonbonded task, so do not influence task assignment
-    // even though they affect the force calculation workload.
-    bool useGpuForNonbonded = false;
-    bool useGpuForPme       = false;
-    bool useGpuForBonded    = false;
-    bool useGpuForUpdate    = false;
-    bool gpusWereDetected   = hwinfo_->ngpu_compatible_tot > 0;
-    try
-    {
-        // It's possible that there are different numbers of GPUs on
-        // different nodes, which is the user's responsibility to
-        // handle. If unsuitable, we will notice that during task
-        // assignment.
-        auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
-        useGpuForNonbonded         = decideWhetherToUseGpusForNonbonded(
-                nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
-                gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
-        useGpuForPme = decideWhetherToUseGpusForPme(
-                useGpuForNonbonded, pmeTarget, pmeFftTarget, userGpuTaskAssignment, *hwinfo_,
-                *inputrec, cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected);
-        useGpuForBonded = decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
-                                                          bondedTarget, *inputrec, mtop,
-                                                          domdecOptions.numPmeRanks, gpusWereDetected);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-    const PmeRunMode pmeRunMode = determinePmeRunMode(useGpuForPme, pmeFftTarget, *inputrec);
-
-    // Initialize development feature flags that enabled by environment variable
-    // and report those features that are enabled.
-    const DevelopmentFeatureFlags devFlags =
-            manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
-
-    const bool useModularSimulator =
-            checkUseModularSimulator(false, inputrec.get(), doRerun, mtop, ms, replExParams,
-                                     nullptr, doEssentialDynamics, membedHolder.doMembed());
-
-    // Build restraints.
-    // TODO: hide restraint implementation details from Mdrunner.
-    // There is nothing unique about restraints at this point as far as the
-    // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
-    // factory functions from the SimulationContext on which to call mdModules_->add().
-    // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
-    for (auto&& restraint : restraintManager_->getRestraints())
-    {
-        auto module = RestraintMDModule::create(restraint, restraint->sites());
-        mdModules_->add(std::move(module));
-    }
-
-    // TODO: Error handling
-    mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
-    // now that the MdModules know their options, they know which callbacks to sign up to
-    mdModules_->subscribeToSimulationSetupNotifications();
-    const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
-
-    if (inputrec->internalParameters != nullptr)
-    {
-        mdModulesNotifier.notify(*inputrec->internalParameters);
-    }
-
-    if (fplog != nullptr)
-    {
-        pr_inputrec(fplog, 0, "Input Parameters", inputrec.get(), FALSE);
-        fprintf(fplog, "\n");
-    }
-
-    if (SIMMASTER(cr))
-    {
-        /* In rerun, set velocities to zero if present */
-        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
-        {
-            // rerun does not use velocities
-            GMX_LOG(mdlog.info)
-                    .asParagraph()
-                    .appendText(
-                            "Rerun trajectory contains velocities. Rerun does only evaluate "
-                            "potential energy and forces. The velocities will be ignored.");
-            for (int i = 0; i < globalState->natoms; i++)
-            {
-                clear_rvec(globalState->v[i]);
-            }
-            globalState->flags &= ~(1 << estV);
-        }
-
-        /* now make sure the state is initialized and propagated */
-        set_state_entries(globalState.get(), inputrec.get(), useModularSimulator);
-    }
-
-    /* NM and TPI parallelize over force/energy calculations, not atoms,
-     * so we need to initialize and broadcast the global state.
-     */
-    if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
-    {
-        if (!MASTER(cr))
-        {
-            globalState = std::make_unique<t_state>();
-        }
-        broadcastStateWithoutDynamics(cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr),
-                                      globalState.get());
-    }
-
-    /* A parallel command line option consistency check that we can
-       only do after any threads have started. */
-    if (!PAR(cr)
-        && (domdecOptions.numCells[XX] > 1 || domdecOptions.numCells[YY] > 1
-            || domdecOptions.numCells[ZZ] > 1 || domdecOptions.numPmeRanks > 0))
-    {
-        gmx_fatal(FARGS,
-                  "The -dd or -npme option request a parallel simulation, "
-#if !GMX_MPI
-                  "but %s was compiled without threads or MPI enabled",
-                  output_env_get_program_display_name(oenv));
-#elif GMX_THREAD_MPI
-                  "but the number of MPI-threads (option -ntmpi) is not set or is 1");
-#else
-                  "but %s was not started through mpirun/mpiexec or only one rank was requested "
-                  "through mpirun/mpiexec",
-                  output_env_get_program_display_name(oenv));
-#endif
-    }
-
-    if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
-    {
-        gmx_fatal(FARGS,
-                  "The .mdp file specified an energy mininization or normal mode algorithm, and "
-                  "these are not compatible with mdrun -rerun");
-    }
-
-    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
-    {
-        if (domdecOptions.numPmeRanks > 0)
-        {
-            gmx_fatal_collective(FARGS, cr->mpiDefaultCommunicator, MASTER(cr),
-                                 "PME-only ranks are requested, but the system does not use PME "
-                                 "for electrostatics or LJ");
-        }
-
-        domdecOptions.numPmeRanks = 0;
-    }
-
-    if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
-    {
-        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
-         * improve performance with many threads per GPU, since our OpenMP
-         * scaling is bad, but it's difficult to automate the setup.
-         */
-        domdecOptions.numPmeRanks = 0;
-    }
-    if (useGpuForPme)
-    {
-        if (domdecOptions.numPmeRanks < 0)
-        {
-            domdecOptions.numPmeRanks = 0;
-            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
-        }
-        else
-        {
-            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1,
-                               "PME GPU decomposition is not supported");
-        }
-    }
-
-    /* NMR restraints must be initialized before load_checkpoint,
-     * since with time averaging the history is added to t_state.
-     * For proper consistency check we therefore need to extend
-     * t_state here.
-     * So the PME-only nodes (if present) will also initialize
-     * the distance restraints.
-     */
-
-    /* This needs to be called before read_checkpoint to extend the state */
-    t_disresdata* disresdata;
-    snew(disresdata, 1);
-    init_disres(fplog, &mtop, inputrec.get(), DisResRunMode::MDRun,
-                MASTER(cr) ? DDRole::Master : DDRole::Agent,
-                PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mysim, ms, disresdata,
-                globalState.get(), replExParams.exchangeInterval > 0);
-
-    t_oriresdata* oriresdata;
-    snew(oriresdata, 1);
-    init_orires(fplog, &mtop, inputrec.get(), cr, ms, globalState.get(), oriresdata);
-
-    auto deform = prepareBoxDeformation(
-            globalState != nullptr ? globalState->box : box, MASTER(cr) ? DDRole::Master : DDRole::Agent,
-            PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mygroup, *inputrec);
-
-#if GMX_FAHCORE
-    /* We have to remember the generation's first step before reading checkpoint.
-       This way, we can report to the F@H core both the generation's first step
-       and the restored first step, thus making it able to distinguish between
-       an interruption/resume and start of the n-th generation simulation.
-       Having this information, the F@H core can correctly calculate and report
-       the progress.
-     */
-    int gen_first_step = 0;
-    if (MASTER(cr))
-    {
-        gen_first_step = inputrec->init_step;
-    }
-#endif
-
-    ObservablesHistory observablesHistory = {};
-
-    auto modularSimulatorCheckpointData = std::make_unique<ReadCheckpointDataHolder>();
-    if (startingBehavior != StartingBehavior::NewSimulation)
-    {
-        /* Check if checkpoint file exists before doing continuation.
-         * This way we can use identical input options for the first and subsequent runs...
-         */
-        if (mdrunOptions.numStepsCommandline > -2)
-        {
-            /* Temporarily set the number of steps to unlimited to avoid
-             * triggering the nsteps check in load_checkpoint().
-             * This hack will go away soon when the -nsteps option is removed.
-             */
-            inputrec->nsteps = -1;
-        }
-
-        // Finish applying initial simulation state information from external sources on all ranks.
-        // Reconcile checkpoint file data with Mdrunner state established up to this point.
-        applyLocalState(*inputHolder_.get(), logFileHandle, cr, domdecOptions.numCells,
-                        inputrec.get(), globalState.get(), &observablesHistory,
-                        mdrunOptions.reproducible, mdModules_->notifier(),
-                        modularSimulatorCheckpointData.get(), useModularSimulator);
-        // TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
-        //  invariants
-        //  on all code paths.
-        // Write checkpoint or provide hook to update SimulationInput.
-        // If there was a checkpoint file, SimulationInput contains more information
-        // than if there wasn't. At this point, we have synchronized the in-memory
-        // state with the filesystem state only for restarted simulations. We should
-        // be calling applyLocalState unconditionally and expect that the completeness
-        // of SimulationInput is not dependent on its creation method.
-
-        if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
-        {
-            // Now we can start normal logging to the truncated log file.
-            fplog = gmx_fio_getfp(logFileHandle);
-            prepareLogAppending(fplog);
-            logOwner = buildLogger(fplog, MASTER(cr));
-            mdlog    = logOwner.logger();
-        }
-    }
-
-#if GMX_FAHCORE
-    if (MASTER(cr))
-    {
-        fcRegisterSteps(inputrec->nsteps + inputrec->init_step, gen_first_step);
-    }
-#endif
-
-    if (mdrunOptions.numStepsCommandline > -2)
-    {
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText(
-                        "The -nsteps functionality is deprecated, and may be removed in a future "
-                        "version. "
-                        "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp "
-                        "file field.");
-    }
-    /* override nsteps with value set on the commandline */
-    override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec.get());
-
-    if (isSimulationMasterRank)
-    {
-        copy_mat(globalState->box, box);
-    }
-
-    if (PAR(cr))
-    {
-        gmx_bcast(sizeof(box), box, cr->mpiDefaultCommunicator);
-    }
-
-    if (inputrec->cutoff_scheme != ecutsVERLET)
-    {
-        gmx_fatal(FARGS,
-                  "This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
-                  "Verlet scheme, or use an earlier version of GROMACS if necessary.");
-    }
-    /* Update rlist and nstlist. */
-    /* Note: prepare_verlet_scheme is calling increaseNstlist(...), which (while attempting to
-     * increase rlist) tries to check if the newly chosen value fits with the DD scheme. As this is
-     * run before any DD scheme is set up, this check is never executed. See #3334 for more details.
-     */
-    prepare_verlet_scheme(fplog, cr, inputrec.get(), nstlist_cmdline, &mtop, box,
-                          useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
-                          *hwinfo_->cpuInfo);
-
-    // This builder is necessary while we have multi-part construction
-    // of DD. Before DD is constructed, we use the existence of
-    // the builder object to indicate that further construction of DD
-    // is needed.
-    std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
-    if (useDomainDecomposition)
-    {
-        ddBuilder = std::make_unique<DomainDecompositionBuilder>(
-                mdlog, cr, domdecOptions, mdrunOptions, mtop, *inputrec, box,
-                positionsFromStatePointer(globalState.get()));
-    }
-    else
-    {
-        /* PME, if used, is done on all nodes with 1D decomposition */
-        cr->nnodes     = cr->sizeOfDefaultCommunicator;
-        cr->sim_nodeid = cr->rankInDefaultCommunicator;
-        cr->nodeid     = cr->rankInDefaultCommunicator;
-        cr->npmenodes  = 0;
-        cr->duty       = (DUTY_PP | DUTY_PME);
-
-        if (inputrec->pbcType == PbcType::Screw)
-        {
-            gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
-        }
-    }
-
-    // Produce the task assignment for this rank - done after DD is constructed
-    GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
-            gpuIdsToUse, userGpuTaskAssignment, *hwinfo_, simulationCommunicator, physicalNodeComm,
-            nonbondedTarget, pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded,
-            useGpuForPme, thisRankHasDuty(cr, DUTY_PP),
-            // TODO cr->duty & DUTY_PME should imply that a PME
-            // algorithm is active, but currently does not.
-            EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
-
-    // Get the device handles for the modules, nullptr when no task is assigned.
-    int                deviceId   = -1;
-    DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
-
-    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
-    bool useTiming = true;
-
-    if (GMX_GPU_CUDA)
-    {
-        /* WARNING: CUDA timings are incorrect with multiple streams.
-         *          This is the main reason why they are disabled by default.
-         */
-        // TODO: Consider turning on by default when we can detect nr of streams.
-        useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-    }
-    else if (GMX_GPU_OPENCL)
-    {
-        useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-    }
-
-    // TODO Currently this is always built, yet DD partition code
-    // checks if it is built before using it. Probably it should
-    // become an MDModule that is made only when another module
-    // requires it (e.g. pull, CompEl, density fitting), so that we
-    // don't update the local atom sets unilaterally every step.
-    LocalAtomSetManager atomSets;
-    if (ddBuilder)
-    {
-        // TODO Pass the GPU streams to ddBuilder to use in buffer
-        // transfers (e.g. halo exchange)
-        cr->dd = ddBuilder->build(&atomSets);
-        // The builder's job is done, so destruct it
-        ddBuilder.reset(nullptr);
-        // Note that local state still does not exist yet.
-    }
-    // Ensure that all atoms within the same update group are in the
-    // same periodic image. Otherwise, a simulation that did not use
-    // update groups (e.g. a single-rank simulation) cannot always be
-    // correctly restarted in a way that does use update groups
-    // (e.g. a multi-rank simulation).
-    if (isSimulationMasterRank)
-    {
-        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-        if (useUpdateGroups)
-        {
-            putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
-        }
-    }
-
-    // The GPU update is decided here because we need to know whether the constraints or
-    // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
-    // defined). This is only known after DD is initialized, hence decision on using GPU
-    // update is done so late.
-    try
-    {
-        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-        const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
-
-        useGpuForUpdate = decideWhetherToUseGpuForUpdate(
-                useDomainDecomposition, useUpdateGroups, pmeRunMode, domdecOptions.numPmeRanks > 0,
-                useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
-                doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                replExParams.exchangeInterval > 0, haveFrozenAtoms, doRerun, devFlags, mdlog);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-    const bool printHostName = (cr->nnodes > 1);
-    gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
-
-    const bool disableNonbondedCalculation = (getenv("GMX_NO_NONBONDED") != nullptr);
-    if (disableNonbondedCalculation)
-    {
-        /* turn off non-bonded calculations */
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendText(
-                        "Found environment variable GMX_NO_NONBONDED.\n"
-                        "Disabling nonbonded calculations.");
-    }
-
-    MdrunScheduleWorkload runScheduleWork;
-
-    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
-            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
-            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
-
-    // Also populates the simulation constant workload description.
-    runScheduleWork.simulationWork = createSimulationWorkload(
-            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
-            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
-
-    std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
-
-    if (deviceInfo != nullptr)
-    {
-        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
-        {
-            dd_setup_dlb_resource_sharing(cr, deviceId);
-        }
-        deviceStreamManager = std::make_unique<DeviceStreamManager>(
-                *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useTiming);
-    }
-
-    // If the user chose a task assignment, give them some hints
-    // where appropriate.
-    if (!userGpuTaskAssignment.empty())
-    {
-        gpuTaskAssignments.logPerformanceHints(mdlog, numDevicesToUse);
-    }
-
-    if (PAR(cr))
-    {
-        /* After possible communicator splitting in make_dd_communicators.
-         * we can set up the intra/inter node communication.
-         */
-        gmx_setup_nodecomm(fplog, cr);
-    }
-
-#if GMX_MPI
-    if (isMultiSim(ms))
-    {
-        GMX_LOG(mdlog.warning)
-                .asParagraph()
-                .appendTextFormatted(
-                        "This is simulation %d out of %d running as a composite GROMACS\n"
-                        "multi-simulation job. Setup for this simulation:\n",
-                        ms->simulationIndex_, ms->numSimulations_);
-    }
-    GMX_LOG(mdlog.warning)
-            .appendTextFormatted("Using %d MPI %s\n", cr->nnodes,
-#    if GMX_THREAD_MPI
-                                 cr->nnodes == 1 ? "thread" : "threads"
-#    else
-                                 cr->nnodes == 1 ? "process" : "processes"
-#    endif
-            );
-    fflush(stderr);
-#endif
-
-    // If mdrun -pin auto honors any affinity setting that already
-    // exists. If so, it is nice to provide feedback about whether
-    // that existing affinity setting was from OpenMP or something
-    // else, so we run this code both before and after we initialize
-    // the OpenMP support.
-    gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, FALSE);
-    /* Check and update the number of OpenMP threads requested */
-    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo_, cr, ms, physicalNodeComm.size_,
-                                            pmeRunMode, mtop, *inputrec);
-
-    gmx_omp_nthreads_init(mdlog, cr, hwinfo_->nthreads_hw_avail, physicalNodeComm.size_,
-                          hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
-
-    // Enable FP exception detection, but not in
-    // Release mode and not for compilers with known buggy FP
-    // exception support (clang with any optimization) or suspected
-    // buggy FP exception support (gcc 7.* with optimization).
-#if !defined NDEBUG                                                                         \
-        && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
-             && defined __OPTIMIZE__)
-    const bool bEnableFPE = true;
-#else
-    const bool bEnableFPE = false;
-#endif
-    // FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
-    if (bEnableFPE)
-    {
-        gmx_feenableexcept();
-    }
-
-    /* Now that we know the setup is consistent, check for efficiency */
-    check_resource_division_efficiency(hwinfo_, gpuTaskAssignments.thisRankHasAnyGpuTask(),
-                                       mdrunOptions.ntompOptionIsSet, cr, mdlog);
-
-    /* getting number of PP/PME threads on this MPI / tMPI rank.
-       PME: env variable should be read only on one node to make sure it is
-       identical everywhere;
-     */
-    const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
-                                                                  : gmx_omp_nthreads_get(emntPME);
-    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology,
-                                  physicalNodeComm, mdlog);
-
-    // Enable Peer access between GPUs where available
-    // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
-    // any of the GPU communication features are active.
-    if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
-        && (runScheduleWork.simulationWork.useGpuHaloExchange
-            || runScheduleWork.simulationWork.useGpuPmePpCommunication))
-    {
-        setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
-    }
-
-    if (hw_opt.threadAffinity != ThreadAffinity::Off)
-    {
-        /* Before setting affinity, check whether the affinity has changed
-         * - which indicates that probably the OpenMP library has changed it
-         * since we first checked).
-         */
-        gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, TRUE);
-
-        int numThreadsOnThisNode, intraNodeThreadOffset;
-        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
-                                 &intraNodeThreadOffset);
-
-        /* Set the CPU affinity */
-        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo_->hardwareTopology, numThreadsOnThisRank,
-                                numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
-    }
-
-    if (mdrunOptions.timingOptions.resetStep > -1)
-    {
-        GMX_LOG(mdlog.info)
-                .asParagraph()
-                .appendText(
-                        "The -resetstep functionality is deprecated, and may be removed in a "
-                        "future version.");
-    }
-    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
-
-    if (PAR(cr))
-    {
-        /* Master synchronizes its value of reset_counters with all nodes
-         * including PME only nodes */
-        int64_t reset_counters = wcycle_get_reset_counters(wcycle);
-        gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
-        wcycle_set_reset_counters(wcycle, reset_counters);
-    }
-
-    // Membrane embedding must be initialized before we call init_forcerec()
-    membedHolder.initializeMembed(fplog, filenames.size(), filenames.data(), &mtop, inputrec.get(),
-                                  globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
-
-    const bool               thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
-    std::unique_ptr<MDAtoms> mdAtoms;
-    std::unique_ptr<VirtualSitesHandler> vsite;
-    std::unique_ptr<GpuBonded>           gpuBonded;
-
-    t_nrnb nrnb;
-    if (thisRankHasDuty(cr, DUTY_PP))
-    {
-        mdModulesNotifier.notify(*cr);
-        mdModulesNotifier.notify(&atomSets);
-        mdModulesNotifier.notify(inputrec->pbcType);
-        mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
-        /* Initiate forcerecord */
-        fr                 = new t_forcerec;
-        fr->forceProviders = mdModules_->initForceProviders();
-        init_forcerec(fplog, mdlog, fr, inputrec.get(), &mtop, cr, box,
-                      opt2fn("-table", filenames.size(), filenames.data()),
-                      opt2fn("-tablep", filenames.size(), filenames.data()),
-                      opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
-        // Dirty hack, for fixing disres and orires should be made mdmodules
-        fr->fcdata->disres = disresdata;
-        fr->fcdata->orires = oriresdata;
-
-        // Save a handle to device stream manager to use elsewhere in the code
-        // TODO: Forcerec is not a correct place to store it.
-        fr->deviceStreamManager = deviceStreamManager.get();
-
-        if (runScheduleWork.simulationWork.useGpuPmePpCommunication && !thisRankHasDuty(cr, DUTY_PME))
-        {
-            GMX_RELEASE_ASSERT(
-                    deviceStreamManager != nullptr,
-                    "GPU device stream manager should be valid in order to use PME-PP direct "
-                    "communications.");
-            GMX_RELEASE_ASSERT(
-                    deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
-                    "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
-                    "communications.");
-            fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
-                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
-                    deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
-        }
-
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo_,
-                                        runScheduleWork.simulationWork.useGpuNonbonded,
-                                        deviceStreamManager.get(), &mtop, box, wcycle);
-        // TODO: Move the logic below to a GPU bonded builder
-        if (runScheduleWork.simulationWork.useGpuBonded)
-        {
-            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
-                               "GPU device stream manager should be valid in order to use GPU "
-                               "version of bonded forces.");
-            gpuBonded = std::make_unique<GpuBonded>(
-                    mtop.ffparams, fr->ic->epsfac * fr->fudgeQQ, deviceStreamManager->context(),
-                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
-            fr->gpuBonded = gpuBonded.get();
-        }
-
-        /* Initialize the mdAtoms structure.
-         * mdAtoms is not filled with atom data,
-         * as this can not be done now with domain decomposition.
-         */
-        mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
-        if (globalState && thisRankHasPmeGpuTask)
-        {
-            // The pinning of coordinates in the global state object works, because we only use
-            // PME on GPU without DD or on a separate PME rank, and because the local state pointer
-            // points to the global state object without DD.
-            // FIXME: MD and EM separately set up the local state - this should happen in the same
-            // function, which should also perform the pinning.
-            changePinningPolicy(&globalState->x, pme_get_pinning_policy());
-        }
-
-        /* Initialize the virtual site communication */
-        vsite = makeVirtualSitesHandler(mtop, cr, fr->pbcType);
-
-        calc_shifts(box, fr->shift_vec);
-
-        /* With periodic molecules the charge groups should be whole at start up
-         * and the virtual sites should not be far from their proper positions.
-         */
-        if (!inputrec->bContinuation && MASTER(cr)
-            && !(inputrec->pbcType != PbcType::No && inputrec->bPeriodicMols))
-        {
-            /* Make molecules whole at start of run */
-            if (fr->pbcType != PbcType::No)
-            {
-                do_pbc_first_mtop(fplog, inputrec->pbcType, box, &mtop, globalState->x.rvec_array());
-            }
-            if (vsite)
-            {
-                /* Correct initial vsite positions are required
-                 * for the initial distribution in the domain decomposition
-                 * and for the initial shell prediction.
-                 */
-                constructVirtualSitesGlobal(mtop, globalState->x);
-            }
-        }
-
-        if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
-        {
-            ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
-            ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-        }
-    }
-    else
-    {
-        /* This is a PME only node */
-
-        GMX_ASSERT(globalState == nullptr,
-                   "We don't need the state on a PME only rank and expect it to be unitialized");
-
-        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
-        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
-    }
-
-    gmx_pme_t* sepPmeData = nullptr;
-    // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
-    GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr),
-               "Double-checking that only PME-only ranks have no forcerec");
-    gmx_pme_t*& pmedata = fr ? fr->pmedata : sepPmeData;
-
-    // TODO should live in ewald module once its testing is improved
-    //
-    // Later, this program could contain kernels that might be later
-    // re-used as auto-tuning progresses, or subsequent simulations
-    // are invoked.
-    PmeGpuProgramStorage pmeGpuProgram;
-    if (thisRankHasPmeGpuTask)
-    {
-        GMX_RELEASE_ASSERT(
-                (deviceStreamManager != nullptr),
-                "GPU device stream manager should be initialized in order to use GPU for PME.");
-        GMX_RELEASE_ASSERT((deviceInfo != nullptr),
-                           "GPU device should be initialized in order to use GPU for PME.");
-        pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
-    }
-
-    /* Initiate PME if necessary,
-     * either on all nodes or on dedicated PME nodes only. */
-    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
-    {
-        if (mdAtoms && mdAtoms->mdatoms())
-        {
-            nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
-            if (EVDW_PME(inputrec->vdwtype))
-            {
-                nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed;
-            }
-        }
-        if (cr->npmenodes > 0)
-        {
-            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
-            gmx_bcast(sizeof(nChargePerturbed), &nChargePerturbed, cr->mpi_comm_mysim);
-            gmx_bcast(sizeof(nTypePerturbed), &nTypePerturbed, cr->mpi_comm_mysim);
-        }
-
-        if (thisRankHasDuty(cr, DUTY_PME))
-        {
-            try
-            {
-                // TODO: This should be in the builder.
-                GMX_RELEASE_ASSERT(!runScheduleWork.simulationWork.useGpuPme
-                                           || (deviceStreamManager != nullptr),
-                                   "Device stream manager should be valid in order to use GPU "
-                                   "version of PME.");
-                GMX_RELEASE_ASSERT(
-                        !runScheduleWork.simulationWork.useGpuPme
-                                || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
-                        "GPU PME stream should be valid in order to use GPU version of PME.");
-
-                const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
-                                                             ? &deviceStreamManager->context()
-                                                             : nullptr;
-                const DeviceStream* pmeStream =
-                        runScheduleWork.simulationWork.useGpuPme
-                                ? &deviceStreamManager->stream(DeviceStreamType::Pme)
-                                : nullptr;
-
-                pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec.get(),
-                                       nChargePerturbed != 0, nTypePerturbed != 0,
-                                       mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj,
-                                       gmx_omp_nthreads_get(emntPME), pmeRunMode, nullptr,
-                                       deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-        }
-    }
-
-
-    if (EI_DYNAMICS(inputrec->eI))
-    {
-        /* Turn on signal handling on all nodes */
-        /*
-         * (A user signal from the PME nodes (if any)
-         * is communicated to the PP nodes.
-         */
-        signal_handler_install();
-    }
-
-    pull_t* pull_work = nullptr;
-    if (thisRankHasDuty(cr, DUTY_PP))
-    {
-        /* Assumes uniform use of the number of OpenMP threads */
-        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
-
-        if (inputrec->bPull)
-        {
-            /* Initialize pull code */
-            pull_work = init_pull(fplog, inputrec->pull.get(), inputrec.get(), &mtop, cr, &atomSets,
-                                  inputrec->fepvals->init_lambda);
-            if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
-            {
-                initPullHistory(pull_work, &observablesHistory);
-            }
-            if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
-            {
-                init_pull_output_files(pull_work, filenames.size(), filenames.data(), oenv, startingBehavior);
-            }
-        }
-
-        std::unique_ptr<EnforcedRotation> enforcedRotation;
-        if (inputrec->bRot)
-        {
-            /* Initialize enforced rotation code */
-            enforcedRotation = init_rot(fplog, inputrec.get(), filenames.size(), filenames.data(),
-                                        cr, &atomSets, globalState.get(), &mtop, oenv, mdrunOptions,
-                                        startingBehavior);
-        }
-
-        t_swap* swap = nullptr;
-        if (inputrec->eSwapCoords != eswapNO)
-        {
-            /* Initialize ion swapping code */
-            swap = init_swapcoords(fplog, inputrec.get(),
-                                   opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
-                                   &mtop, globalState.get(), &observablesHistory, cr, &atomSets,
-                                   oenv, mdrunOptions, startingBehavior);
-        }
-
-        /* Let makeConstraints know whether we have essential dynamics constraints. */
-        auto constr = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr,
-                                      ms, &nrnb, wcycle, fr->bMolPBC);
-
-        /* Energy terms and groups */
-        gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
-                             inputrec->fepvals->n_lambda);
-
-        // cos acceleration is only supported by md, but older tpr
-        // files might still combine it with other integrators
-        GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
-                           "cos_acceleration is only supported by integrator=md");
-
-        /* Kinetic energy data */
-        gmx_ekindata_t ekind;
-        init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
-
-        /* Set up interactive MD (IMD) */
-        auto imdSession =
-                makeImdSession(inputrec.get(), cr, wcycle, &enerd, ms, &mtop, mdlog,
-                               MASTER(cr) ? globalState->x.rvec_array() : nullptr, filenames.size(),
-                               filenames.data(), oenv, mdrunOptions.imdOptions, startingBehavior);
-
-        if (DOMAINDECOMP(cr))
-        {
-            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
-            /* This call is not included in init_domain_decomposition mainly
-             * because fr->cginfo_mb is set later.
-             */
-            dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec.get(),
-                            domdecOptions.checkBondedInteractions, fr->cginfo_mb);
-        }
-
-        if (runScheduleWork.simulationWork.useGpuBufferOps)
-        {
-            fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
-                    deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
-            fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
-                    deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
-        }
-
-        std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
-        if (gpusWereDetected
-            && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
-                || runScheduleWork.simulationWork.useGpuBufferOps))
-        {
-            GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
-                                                      ? GpuApiCallBehavior::Async
-                                                      : GpuApiCallBehavior::Sync;
-            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
-                               "GPU device stream manager should be initialized to use GPU.");
-            stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
-            fr->stateGpu = stateGpu.get();
-        }
-
-        GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
-        SimulatorBuilder simulatorBuilder;
-
-        simulatorBuilder.add(SimulatorStateData(globalState.get(), &observablesHistory, &enerd, &ekind));
-        simulatorBuilder.add(std::move(membedHolder));
-        simulatorBuilder.add(std::move(stopHandlerBuilder_));
-        simulatorBuilder.add(SimulatorConfig(mdrunOptions, startingBehavior, &runScheduleWork));
-
-
-        simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
-        simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
-        simulatorBuilder.add(ConstraintsParam(
-                constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
-                vsite.get()));
-        // TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
-        simulatorBuilder.add(LegacyInput(static_cast<int>(filenames.size()), filenames.data(),
-                                         inputrec.get(), fr));
-        simulatorBuilder.add(ReplicaExchangeParameters(replExParams));
-        simulatorBuilder.add(InteractiveMD(imdSession.get()));
-        simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifier()));
-        simulatorBuilder.add(CenterOfMassPulling(pull_work));
-        // Todo move to an MDModule
-        simulatorBuilder.add(IonSwapping(swap));
-        simulatorBuilder.add(TopologyData(&mtop, mdAtoms.get()));
-        simulatorBuilder.add(BoxDeformationHandle(deform.get()));
-        simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
-
-        // build and run simulator object based on user-input
-        auto simulator = simulatorBuilder.build(useModularSimulator);
-        simulator->run();
-
-        if (fr->pmePpCommGpu)
-        {
-            // destroy object since it is no longer required. (This needs to be done while the GPU context still exists.)
-            fr->pmePpCommGpu.reset();
-        }
-
-        if (inputrec->bPull)
-        {
-            finish_pull(pull_work);
-        }
-        finish_swapcoords(swap);
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
-        /* do PME only */
-        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
-        gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec.get(), pmeRunMode,
-                    deviceStreamManager.get());
-    }
-
-    wallcycle_stop(wcycle, ewcRUN);
-
-    /* Finish up, write some stuff
-     * if rerunMD, don't write last frame again
-     */
-    finish_run(fplog, mdlog, cr, inputrec.get(), &nrnb, wcycle, walltime_accounting,
-               fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
-
-    // clean up cycle counter
-    wallcycle_destroy(wcycle);
-
-    deviceStreamManager.reset(nullptr);
-    // Free PME data
-    if (pmedata)
-    {
-        gmx_pme_destroy(pmedata);
-        pmedata = nullptr;
-    }
-
-    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
-    // before we destroy the GPU context(s)
-    // Pinned buffers are associated with contexts in CUDA.
-    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
-    mdAtoms.reset(nullptr);
-    globalState.reset(nullptr);
-    mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
-    gpuBonded.reset(nullptr);
-    /* Free pinned buffers in *fr */
-    delete fr;
-    fr = nullptr;
-    // TODO convert to C++ so we can get rid of these frees
-    sfree(disresdata);
-    sfree(oriresdata);
-
-    if (!hwinfo_->deviceInfoList.empty())
-    {
-        /* stop the GPU profiler (only CUDA) */
-        stopGpuProfiler();
-    }
-
-    /* With tMPI we need to wait for all ranks to finish deallocation before
-     * destroying the CUDA context as some tMPI ranks may be sharing
-     * GPU and context.
-     *
-     * This is not a concern in OpenCL where we use one context per rank.
-     *
-     * Note: it is safe to not call the barrier on the ranks which do not use GPU,
-     * but it is easier and more futureproof to call it on the whole node.
-     *
-     * Note that this function needs to be called even if GPUs are not used
-     * in this run because the PME ranks have no knowledge of whether GPUs
-     * are used or not, but all ranks need to enter the barrier below.
-     * \todo Remove this physical node barrier after making sure
-     * that it's not needed anymore (with a shared GPU run).
-     */
-    if (GMX_THREAD_MPI)
-    {
-        physicalNodeComm.barrier();
-    }
-    releaseDevice(deviceInfo);
-
-    /* Does what it says */
-    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
-    walltime_accounting_destroy(walltime_accounting);
-
-    // Ensure log file content is written
-    if (logFileHandle)
-    {
-        gmx_fio_flush(logFileHandle);
-    }
-
-    /* Reset FPEs (important for unit tests) by disabling them. Assumes no
-     * exceptions were enabled before function was called. */
-    if (bEnableFPE)
-    {
-        gmx_fedisableexcept();
-    }
-
-    auto rc = static_cast<int>(gmx_get_stop_condition());
-
-#if GMX_THREAD_MPI
-    /* we need to join all threads. The sub-threads join when they
-       exit this function, but the master thread needs to be told to
-       wait for that. */
-    if (MASTER(cr))
-    {
-        tMPI_Finalize();
-    }
-#endif
-    return rc;
-} // namespace gmx
-
-Mdrunner::~Mdrunner()
-{
-    // Clean up of the Manager.
-    // This will end up getting called on every thread-MPI rank, which is unnecessary,
-    // but okay as long as threads synchronize some time before adding or accessing
-    // a new set of restraints.
-    if (restraintManager_)
-    {
-        restraintManager_->clear();
-        GMX_ASSERT(restraintManager_->countRestraints() == 0,
-                   "restraints added during runner life time should be cleared at runner "
-                   "destruction.");
-    }
-};
-
-void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller, const std::string& name)
-{
-    GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
-    // Not sure if this should be logged through the md logger or something else,
-    // but it is helpful to have some sort of INFO level message sent somewhere.
-    //    std::cout << "Registering restraint named " << name << std::endl;
-
-    // When multiple restraints are used, it may be wasteful to register them separately.
-    // Maybe instead register an entire Restraint Manager as a force provider.
-    restraintManager_->addToSpec(std::move(puller), name);
-}
-
-Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules) : mdModules_(std::move(mdModules)) {}
-
-Mdrunner::Mdrunner(Mdrunner&&) noexcept = default;
-
-//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265 in CentOS 7
-Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
-
-class Mdrunner::BuilderImplementation
-{
-public:
-    BuilderImplementation() = delete;
-    BuilderImplementation(std::unique_ptr<MDModules> mdModules, compat::not_null<SimulationContext*> context);
-    ~BuilderImplementation();
-
-    BuilderImplementation& setExtraMdrunOptions(const MdrunOptions& options,
-                                                real                forceWarningThreshold,
-                                                StartingBehavior    startingBehavior);
-
-    void addHardwareDetectionResult(const gmx_hw_info_t* hwinfo);
-
-    void addDomdec(const DomdecOptions& options);
-
-    void addInput(SimulationInputHandle inputHolder);
-
-    void addVerletList(int nstlist);
-
-    void addReplicaExchange(const ReplicaExchangeParameters& params);
-
-    void addNonBonded(const char* nbpu_opt);
-
-    void addPME(const char* pme_opt_, const char* pme_fft_opt_);
-
-    void addBondedTaskAssignment(const char* bonded_opt);
-
-    void addUpdateTaskAssignment(const char* update_opt);
-
-    void addHardwareOptions(const gmx_hw_opt_t& hardwareOptions);
-
-    void addFilenames(ArrayRef<const t_filenm> filenames);
-
-    void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
-
-    void addLogFile(t_fileio* logFileHandle);
-
-    void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
-
-    Mdrunner build();
-
-private:
-    // Default parameters copied from runner.h
-    // \todo Clarify source(s) of default parameters.
-
-    const char* nbpu_opt_    = nullptr;
-    const char* pme_opt_     = nullptr;
-    const char* pme_fft_opt_ = nullptr;
-    const char* bonded_opt_  = nullptr;
-    const char* update_opt_  = nullptr;
-
-    MdrunOptions mdrunOptions_;
-
-    DomdecOptions domdecOptions_;
-
-    ReplicaExchangeParameters replicaExchangeParameters_;
-
-    //! Command-line override for the duration of a neighbor list with the Verlet scheme.
-    int nstlist_ = 0;
-
-    //! World communicator, used for hardware detection and task assignment
-    MPI_Comm libraryWorldCommunicator_ = MPI_COMM_NULL;
-
-    //! Multisim communicator handle.
-    gmx_multisim_t* multiSimulation_;
-
-    //! mdrun communicator
-    MPI_Comm simulationCommunicator_ = MPI_COMM_NULL;
-
-    //! Print a warning if any force is larger than this (in kJ/mol nm).
-    real forceWarningThreshold_ = -1;
-
-    //! Whether the simulation will start afresh, or restart with/without appending.
-    StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
-
-    //! The modules that comprise the functionality of mdrun.
-    std::unique_ptr<MDModules> mdModules_;
-
-    //! Detected hardware.
-    const gmx_hw_info_t* hwinfo_ = nullptr;
-
-    //! \brief Parallelism information.
-    gmx_hw_opt_t hardwareOptions_;
-
-    //! filename options for simulation.
-    ArrayRef<const t_filenm> filenames_;
-
-    /*! \brief Handle to output environment.
-     *
-     * \todo gmx_output_env_t needs lifetime management.
-     */
-    gmx_output_env_t* outputEnvironment_ = nullptr;
-
-    /*! \brief Non-owning handle to MD log file.
-     *
-     * \todo Context should own output facilities for client.
-     * \todo Improve log file handle management.
-     * \internal
-     * Code managing the FILE* relies on the ability to set it to
-     * nullptr to check whether the filehandle is valid.
-     */
-    t_fileio* logFileHandle_ = nullptr;
-
-    /*!
-     * \brief Builder for simulation stop signal handler.
-     */
-    std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
-
-    /*!
-     * \brief Sources for initial simulation state.
-     *
-     * See issue #3652 for near-term refinements to the SimulationInput interface.
-     *
-     * See issue #3379 for broader discussion on API aspects of simulation inputs and outputs.
-     */
-    SimulationInputHandle inputHolder_;
-};
-
-Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
-                                                       compat::not_null<SimulationContext*> context) :
-    mdModules_(std::move(mdModules))
-{
-    libraryWorldCommunicator_ = context->libraryWorldCommunicator_;
-    simulationCommunicator_   = context->simulationCommunicator_;
-    multiSimulation_          = context->multiSimulation_.get();
-}
-
-Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
-
-Mdrunner::BuilderImplementation&
-Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions&    options,
-                                                      const real             forceWarningThreshold,
-                                                      const StartingBehavior startingBehavior)
-{
-    mdrunOptions_          = options;
-    forceWarningThreshold_ = forceWarningThreshold;
-    startingBehavior_      = startingBehavior;
-    return *this;
-}
-
-void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions& options)
-{
-    domdecOptions_ = options;
-}
-
-void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
-{
-    nstlist_ = nstlist;
-}
-
-void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters& params)
-{
-    replicaExchangeParameters_ = params;
-}
-
-Mdrunner Mdrunner::BuilderImplementation::build()
-{
-    auto newRunner = Mdrunner(std::move(mdModules_));
-
-    newRunner.mdrunOptions     = mdrunOptions_;
-    newRunner.pforce           = forceWarningThreshold_;
-    newRunner.startingBehavior = startingBehavior_;
-    newRunner.domdecOptions    = domdecOptions_;
-
-    // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
-    newRunner.hw_opt = hardwareOptions_;
-
-    // No invariant to check. This parameter exists to optionally override other behavior.
-    newRunner.nstlist_cmdline = nstlist_;
-
-    newRunner.replExParams = replicaExchangeParameters_;
-
-    newRunner.filenames = filenames_;
-
-    newRunner.libraryWorldCommunicator = libraryWorldCommunicator_;
-
-    newRunner.simulationCommunicator = simulationCommunicator_;
-
-    // nullptr is a valid value for the multisim handle
-    newRunner.ms = multiSimulation_;
-
-    if (hwinfo_)
-    {
-        newRunner.hwinfo_ = hwinfo_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addHardwareDetectionResult() is required before build()"));
-    }
-
-    if (inputHolder_)
-    {
-        newRunner.inputHolder_ = std::move(inputHolder_);
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError("MdrunnerBuilder::addInput() is required before build()."));
-    }
-
-    // \todo Clarify ownership and lifetime management for gmx_output_env_t
-    // \todo Update sanity checking when output environment has clearly specified invariants.
-    // Initialization and default values for oenv are not well specified in the current version.
-    if (outputEnvironment_)
-    {
-        newRunner.oenv = outputEnvironment_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addOutputEnvironment() is required before build()"));
-    }
-
-    newRunner.logFileHandle = logFileHandle_;
-
-    if (nbpu_opt_)
-    {
-        newRunner.nbpu_opt = nbpu_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
-    }
-
-    if (pme_opt_ && pme_fft_opt_)
-    {
-        newRunner.pme_opt     = pme_opt_;
-        newRunner.pme_fft_opt = pme_fft_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
-    }
-
-    if (bonded_opt_)
-    {
-        newRunner.bonded_opt = bonded_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
-    }
-
-    if (update_opt_)
-    {
-        newRunner.update_opt = update_opt_;
-    }
-    else
-    {
-        GMX_THROW(gmx::APIError(
-                "MdrunnerBuilder::addUpdateTaskAssignment() is required before build()  "));
-    }
-
-
-    newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
-
-    if (stopHandlerBuilder_)
-    {
-        newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
-    }
-    else
-    {
-        newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
-    }
-
-    return newRunner;
-}
-
-void Mdrunner::BuilderImplementation::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
-{
-    hwinfo_ = hwinfo;
-}
-
-void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
-{
-    nbpu_opt_ = nbpu_opt;
-}
-
-void Mdrunner::BuilderImplementation::addPME(const char* pme_opt, const char* pme_fft_opt)
-{
-    pme_opt_     = pme_opt;
-    pme_fft_opt_ = pme_fft_opt;
-}
-
-void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
-{
-    bonded_opt_ = bonded_opt;
-}
-
-void Mdrunner::BuilderImplementation::addUpdateTaskAssignment(const char* update_opt)
-{
-    update_opt_ = update_opt;
-}
-
-void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
-{
-    hardwareOptions_ = hardwareOptions;
-}
-
-void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
-{
-    filenames_ = filenames;
-}
-
-void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
-{
-    outputEnvironment_ = outputEnvironment;
-}
-
-void Mdrunner::BuilderImplementation::addLogFile(t_fileio* logFileHandle)
-{
-    logFileHandle_ = logFileHandle;
-}
-
-void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
-{
-    stopHandlerBuilder_ = std::move(builder);
-}
-
-void Mdrunner::BuilderImplementation::addInput(SimulationInputHandle inputHolder)
-{
-    inputHolder_ = std::move(inputHolder);
-}
-
-MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
-                                 compat::not_null<SimulationContext*> context) :
-    impl_{ std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context) }
-{
-}
-
-MdrunnerBuilder::~MdrunnerBuilder() = default;
-
-MdrunnerBuilder& MdrunnerBuilder::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
-{
-    impl_->addHardwareDetectionResult(hwinfo);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addSimulationMethod(const MdrunOptions&    options,
-                                                      real                   forceWarningThreshold,
-                                                      const StartingBehavior startingBehavior)
-{
-    impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addDomainDecomposition(const DomdecOptions& options)
-{
-    impl_->addDomdec(options);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addNeighborList(int nstlist)
-{
-    impl_->addVerletList(nstlist);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters& params)
-{
-    impl_->addReplicaExchange(params);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
-{
-    impl_->addNonBonded(nbpu_opt);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addElectrostatics(const char* pme_opt, const char* pme_fft_opt)
-{
-    // The builder method may become more general in the future, but in this version,
-    // parameters for PME electrostatics are both required and the only parameters
-    // available.
-    if (pme_opt && pme_fft_opt)
-    {
-        impl_->addPME(pme_opt, pme_fft_opt);
-    }
-    else
-    {
-        GMX_THROW(
-                gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
-    }
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
-{
-    impl_->addBondedTaskAssignment(bonded_opt);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addUpdateTaskAssignment(const char* update_opt)
-{
-    impl_->addUpdateTaskAssignment(update_opt);
-    return *this;
-}
-
-Mdrunner MdrunnerBuilder::build()
-{
-    return impl_->build();
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
-{
-    impl_->addHardwareOptions(hardwareOptions);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
-{
-    impl_->addFilenames(filenames);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
-{
-    impl_->addOutputEnvironment(outputEnvironment);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addLogFile(t_fileio* logFileHandle)
-{
-    impl_->addLogFile(logFileHandle);
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
-{
-    impl_->addStopHandlerBuilder(std::move(builder));
-    return *this;
-}
-
-MdrunnerBuilder& MdrunnerBuilder::addInput(SimulationInputHandle input)
-{
-    impl_->addInput(std::move(input));
-    return *this;
-}
-
-MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder&&) noexcept = default;
-
-MdrunnerBuilder& MdrunnerBuilder::operator=(MdrunnerBuilder&&) noexcept = default;
-
-} // namespace gmx