
Commit

updated gromacs 2023 patch
carlocamilloni committed Jun 12, 2024
1 parent 9b6d448 commit d52eb23
Showing 36 changed files with 223 additions and 109 deletions.
4 changes: 4 additions & 0 deletions patches/gromacs-2020.7.config
@@ -9,6 +9,10 @@ function plumed_patch_info(){
cat << EOF
PLUMED can be incorporated into gromacs using the standard patching procedure.
Patching must be done in the gromacs root directory _before_ the cmake command is invoked.
PLUMED is not compatible with the internal thread-MPI implementation of GROMACS, so you need
to configure gromacs with

cmake -DGMX_THREAD_MPI=OFF

and add -DGMX_MPI=ON if you want to use MPI.
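
As a concrete illustration, a patched MPI build might look like the following sketch (directory names, install prefix, and parallel-build settings are assumptions; pick the `plumed patch` engine matching your gromacs version):

```sh
# apply the PLUMED patch in the gromacs root, before running cmake
cd gromacs-2020.7
plumed patch -p -e gromacs-2020.7

# configure without thread-MPI, with real MPI enabled
mkdir build && cd build
cmake .. -DGMX_THREAD_MPI=OFF -DGMX_MPI=ON -DCMAKE_INSTALL_PREFIX=$HOME/opt/gromacs
make -j 4 && make install
```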

On clusters you may want to patch gromacs using the static version of plumed; in this case,
building gromacs can result in multiple errors. One possible solution is to configure gromacs
6 changes: 6 additions & 0 deletions patches/gromacs-2021.7.config
@@ -10,6 +10,12 @@ cat << EOF
PLUMED can be incorporated into gromacs using the standard patching procedure.
Patching must be done in the gromacs root directory _before_ the cmake command is invoked.

PLUMED is not compatible with the internal thread-MPI implementation of GROMACS, so you need
to configure gromacs with

cmake -DGMX_THREAD_MPI=OFF

and add -DGMX_MPI=ON if you want to use MPI.

On clusters you may want to patch gromacs using the static version of plumed; in this case,
building gromacs can result in multiple errors. One possible solution is to configure gromacs
with these additional options:
5 changes: 5 additions & 0 deletions patches/gromacs-2022.5.config
@@ -10,6 +10,11 @@ cat << EOF
PLUMED can be incorporated into gromacs using the standard patching procedure.
Patching must be done in the gromacs root directory _before_ the cmake command is invoked.

PLUMED is not compatible with the internal thread-MPI implementation of GROMACS, so you need
to configure gromacs with

cmake -DGMX_THREAD_MPI=OFF

and add -DGMX_MPI=ON if you want to use MPI.

On clusters you may want to patch gromacs using the static version of plumed; in this case,
building gromacs can result in multiple errors. One possible solution is to configure gromacs
with these additional options:
5 changes: 5 additions & 0 deletions patches/gromacs-2023.config → patches/gromacs-2023.5.config
@@ -10,6 +10,11 @@ cat << EOF
PLUMED can be incorporated into gromacs using the standard patching procedure.
Patching must be done in the gromacs root directory _before_ the cmake command is invoked.

PLUMED is not compatible with the internal thread-MPI implementation of GROMACS, so you need
to configure gromacs with

cmake -DGMX_THREAD_MPI=OFF

and add -DGMX_MPI=ON if you want to use MPI.

On clusters you may want to patch gromacs using the static version of plumed; in this case,
building gromacs can result in multiple errors. One possible solution is to configure gromacs
with these additional options:
@@ -202,7 +202,7 @@ if(GMX_GPU_FFT_VKFFT)
target_link_libraries(libgromacs PRIVATE VkFFT)
endif()
if(GMX_GPU_FFT_ROCFFT)
-target_link_libraries(libgromacs PUBLIC roc::rocfft)
+target_link_libraries(libgromacs PRIVATE roc::rocfft)
endif()

target_link_libraries(libgromacs PRIVATE $<BUILD_INTERFACE:common>)
@@ -200,7 +200,7 @@ if(GMX_GPU_FFT_VKFFT)
target_link_libraries(libgromacs PRIVATE VkFFT)
endif()
if(GMX_GPU_FFT_ROCFFT)
-target_link_libraries(libgromacs PUBLIC roc::rocfft)
+target_link_libraries(libgromacs PRIVATE roc::rocfft)
endif()

target_link_libraries(libgromacs PRIVATE $<BUILD_INTERFACE:common>)
@@ -734,21 +734,23 @@ static void computeSpecialForces(FILE* fplog,
* \param[in] stepWork Step schedule flags
* \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] lambdaQ The Coulomb lambda of the current state.
+* \param[in] useMdGpuGraph Whether MD GPU Graph is in use.
* \param[in] wcycle The wallcycle structure
*/
static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
const matrix box,
const StepWorkload& stepWork,
GpuEventSynchronizer* xReadyOnDevice,
const real lambdaQ,
+bool useMdGpuGraph,
gmx_wallcycle* wcycle)
{
wallcycle_start(wcycle, WallCycleCounter::PmeGpuMesh);
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
bool useGpuDirectComm = false;
gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
pme_gpu_launch_spread(
-pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
+pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu, useMdGpuGraph);
wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
}

@@ -1419,8 +1421,19 @@ void do_force(FILE* fplog,

const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;

-runScheduleWork->stepWork = setupStepWorkload(
-legacyFlags, inputrec.mtsLevels, step, runScheduleWork->domainWork, simulationWork);
+if ((legacyFlags & GMX_FORCE_NS) != 0) // Update domainWork on Neighbor Search steps
+{
+if (fr->listedForcesGpu)
+{
+fr->listedForcesGpu->updateHaveInteractions(top->idef);
+}
+runScheduleWork->domainWork =
+setupDomainLifetimeWorkload(inputrec, *fr, pull_work, ed, *mdatoms, simulationWork);
+}
+const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
+
+runScheduleWork->stepWork =
+setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, domainWork, simulationWork);
const StepWorkload& stepWork = runScheduleWork->stepWork;

if (stepWork.doNeighborSearch && gmx::needStateGpu(simulationWork))
@@ -1574,11 +1587,10 @@ void do_force(FILE* fplog,
stepWork,
localXReadyOnDevice,
lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
+simulationWork.useMdGpuGraph,
wcycle);
}

-const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
-
/* do gridding for pair search */
if (stepWork.doNeighborSearch)
{
@@ -1643,11 +1655,6 @@
}
}

-// Need to run after the GPU-offload bonded interaction lists
-// are set up to be able to determine whether there is bonded work.
-runScheduleWork->domainWork =
-setupDomainLifetimeWorkload(inputrec, *fr, pull_work, ed, *mdatoms, simulationWork);
-
wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
@@ -728,21 +728,23 @@ static void computeSpecialForces(FILE* fplog,
* \param[in] stepWork Step schedule flags
* \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] lambdaQ The Coulomb lambda of the current state.
+* \param[in] useMdGpuGraph Whether MD GPU Graph is in use.
* \param[in] wcycle The wallcycle structure
*/
static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
const matrix box,
const StepWorkload& stepWork,
GpuEventSynchronizer* xReadyOnDevice,
const real lambdaQ,
+bool useMdGpuGraph,
gmx_wallcycle* wcycle)
{
wallcycle_start(wcycle, WallCycleCounter::PmeGpuMesh);
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
bool useGpuDirectComm = false;
gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
pme_gpu_launch_spread(
-pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
+pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu, useMdGpuGraph);
wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
}

@@ -1413,8 +1415,19 @@ void do_force(FILE* fplog,

const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;

-runScheduleWork->stepWork = setupStepWorkload(
-legacyFlags, inputrec.mtsLevels, step, runScheduleWork->domainWork, simulationWork);
+if ((legacyFlags & GMX_FORCE_NS) != 0) // Update domainWork on Neighbor Search steps
+{
+if (fr->listedForcesGpu)
+{
+fr->listedForcesGpu->updateHaveInteractions(top->idef);
+}
+runScheduleWork->domainWork =
+setupDomainLifetimeWorkload(inputrec, *fr, pull_work, ed, *mdatoms, simulationWork);
+}
+const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
+
+runScheduleWork->stepWork =
+setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, domainWork, simulationWork);
const StepWorkload& stepWork = runScheduleWork->stepWork;

if (stepWork.doNeighborSearch && gmx::needStateGpu(simulationWork))
@@ -1568,11 +1581,10 @@ void do_force(FILE* fplog,
stepWork,
localXReadyOnDevice,
lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
+simulationWork.useMdGpuGraph,
wcycle);
}

-const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
-
/* do gridding for pair search */
if (stepWork.doNeighborSearch)
{
@@ -1637,11 +1649,6 @@
}
}

-// Need to run after the GPU-offload bonded interaction lists
-// are set up to be able to determine whether there is bonded work.
-runScheduleWork->domainWork =
-setupDomainLifetimeWorkload(inputrec, *fr, pull_work, ed, *mdatoms, simulationWork);
-
wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
@@ -232,7 +232,8 @@ class LegacyMdrunOptions
FALSE,
etSTR,
{ &userGpuTaskAssignment },
-"List of GPU device IDs, mapping each PP task on each node to a device" },
+"List of GPU device IDs, mapping each task on a node to a device. "
+"Tasks include PP and PME (if present)." },
{ "-ddcheck",
FALSE,
etBOOL,
@@ -220,7 +220,8 @@ public:
FALSE,
etSTR,
{ &userGpuTaskAssignment },
-"List of GPU device IDs, mapping each PP task on each node to a device" },
+"List of GPU device IDs, mapping each task on a node to a device. "
+"Tasks include PP and PME (if present)." },
{ "-ddcheck",
FALSE,
etBOOL,
@@ -999,16 +999,28 @@ void gmx::LegacySimulator::do_md()
// exchange).
if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
{
+if (usedMdGpuGraphLastStep)
+{
+// Wait on coordinates produced from GPU graph
+stateGpu->waitCoordinatesUpdatedOnDevice();
+}
stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
}

-// We only need to calculate virtual velocities if we are writing them in the current step
+// We need to calculate virtual velocities if we are writing them in the current step.
+// They also need to be periodically updated. Every 1000 steps is arbitrary, but a reasonable number.
+// The reason why the velocities need to be updated regularly is that the virtual site coordinates
+// are updated using these velocities during integration. Those coordinates are used for, e.g., domain
+// decomposition. Before computing any forces the positions of the virtual sites are recalculated.
+// This fixes a bug, #4879, which was introduced in MR !979.
+const int c_virtualSiteVelocityUpdateInterval = 1000;
const bool needVirtualVelocitiesThisStep =
(vsite != nullptr)
-&& (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
+&& (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()
+|| do_per_step(step, c_virtualSiteVelocityUpdateInterval));

if (vsite != nullptr)
{
@@ -1140,7 +1152,7 @@ void gmx::LegacySimulator::do_md()
if (plumedswitch && bHREX) {
// gmx_enerdata_t *hrex_enerd;
int nlambda = enerd->foreignLambdaTerms.numLambdas();
-gmx_enerdata_t hrex_enerd(enerd->grpp.nener, nlambda == 0 ? 0 : nlambda - 1);
+gmx_enerdata_t hrex_enerd(enerd->grpp.nener, nlambda == 0 ? nullptr : &inputrec->fepvals->all_lambda);
int repl = -1;
int nrepl = -1;
if (MAIN(cr)){
@@ -1340,7 +1352,9 @@

if (simulationWork.useMdGpuGraph)
{
-if (bNS)
+// Reset graph on search step (due to changing neighbour list etc)
+// or virial step (due to changing shifts and box).
+if (bNS || bCalcVir)
{
fr->mdGraph[MdGraphEvenOrOddStep::EvenStep]->reset();
fr->mdGraph[MdGraphEvenOrOddStep::OddStep]->reset();
@@ -1349,7 +1363,7 @@
{
mdGraph->setUsedGraphLastStep(usedMdGpuGraphLastStep);
bool canUseMdGpuGraphThisStep =
-!bCalcVir && !doTemperatureScaling && !doParrinelloRahman && !bGStat
+!bNS && !bCalcVir && !doTemperatureScaling && !doParrinelloRahman && !bGStat
&& !needHalfStepKineticEnergy && !do_per_step(step, ir->nstxout)
&& !do_per_step(step, ir->nstxout_compressed)
&& !do_per_step(step, ir->nstvout) && !do_per_step(step, ir->nstfout)
@@ -1825,12 +1839,14 @@ void gmx::LegacySimulator::do_md()
else
{
/* With multiple time stepping we need to do an additional normal
-* update step to obtain the virial, as the actual MTS integration
+* update step to obtain the virial and dH/dl, as the actual MTS integration
* using an acceleration where the slow forces are multiplied by mtsFactor.
* Using that acceleration would result in a virial with the slow
* force contribution would be a factor mtsFactor too large.
*/
-if (simulationWork.useMts && bCalcVir && constr != nullptr)
+const bool separateVirialConstraining =
+(simulationWork.useMts && (bCalcVir || computeDHDL) && constr != nullptr);
+if (separateVirialConstraining)
{
upd.update_for_constraint_virial(*ir,
md->homenr,
@@ -1880,8 +1896,8 @@
step,
state,
upd.xp()->arrayRefWithPadding(),
-&dvdl_constr,
-bCalcVir && !simulationWork.useMts,
+separateVirialConstraining ? nullptr : &dvdl_constr,
+bCalcVir && !separateVirialConstraining,
shake_vir);

upd.update_sd_second_half(*ir,
@@ -1916,9 +1932,13 @@
if (mdGraph->graphIsCapturingThisStep())
{
mdGraph->endRecord();
-// Force graph reinstantiation (instead of graph exec update) with PME tuning,
-// since the GPU kernels chosen by the FFT library can vary with grid size
-bool forceGraphReinstantiation = pme_loadbal_is_active(pme_loadbal);
+// Force graph reinstantiation (instead of graph exec
+// update): with PME tuning, since the GPU kernels
+// chosen by the FFT library can vary with grid size;
+// or with an odd nstlist, since the odd/even step
+// pruning pattern will change
+bool forceGraphReinstantiation =
+pme_loadbal_is_active(pme_loadbal) || ((ir->nstlist % 2) == 1);
mdGraph->createExecutableGraph(forceGraphReinstantiation);
}
if (mdGraph->useGraphThisStep())

1 comment on commit d52eb23

@PlumedBot (Contributor) commented:

Found broken examples in automatic/a-masterclass-22-09.txt
Found broken examples in automatic/a-masterclass-22-11.txt
Found broken examples in automatic/a-masterclass-22-12.txt
Found broken examples in automatic/performance-optimization.txt
Found broken examples in automatic/a-trieste-6.txt
Found broken examples in automatic/munster.txt
Found broken examples in automatic/ANN.tmp
Found broken examples in automatic/EDS.tmp
Found broken examples in automatic/EMMI.tmp
Found broken examples in automatic/ENVIRONMENTSIMILARITY.tmp
Found broken examples in automatic/FOURIER_TRANSFORM.tmp
Found broken examples in automatic/FUNCPATHGENERAL.tmp
Found broken examples in automatic/FUNCPATHMSD.tmp
Found broken examples in automatic/FUNNEL.tmp
Found broken examples in automatic/FUNNEL_PS.tmp
Found broken examples in automatic/GHBFIX.tmp
Found broken examples in automatic/INCLUDE.tmp
Found broken examples in automatic/MAZE_OPTIMIZER_BIAS.tmp
Found broken examples in automatic/MAZE_RANDOM_ACCELERATION_MD.tmp
Found broken examples in automatic/MAZE_SIMULATED_ANNEALING.tmp
Found broken examples in automatic/MAZE_STEERED_MD.tmp
Found broken examples in automatic/PIV.tmp
Found broken examples in automatic/PLUMED.tmp
Found broken examples in MiscelaneousPP.md
