diff --git a/cscs-checks/mch/automatic_arrays.py b/cscs-checks/mch/automatic_arrays.py
new file mode 100644
index 0000000000..7fab45dc49
--- /dev/null
+++ b/cscs-checks/mch/automatic_arrays.py
@@ -0,0 +1,65 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.simple_test
+class AutomaticArraysCheck(rfm.RegressionTest):
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
+        self.valid_prog_environs = ['PrgEnv-cray*', 'PrgEnv-pgi*',
+                                    'PrgEnv-gnu']
+        if self.current_system.name in ['daint', 'dom']:
+            self.modules = ['craype-accel-nvidia60']
+            self._pgi_flags = '-acc -ta=tesla:cc60 -Mnorpath'
+            self._cray_variables = {}
+        elif self.current_system.name in ['kesch']:
+            self.modules = ['craype-accel-nvidia35']
+            self._pgi_flags = '-O2 -ta=tesla,cc35,cuda8.0'
+            self._cray_variables = {'MV2_USE_CUDA': '1'}
+
+        self.num_tasks = 1
+        self.num_gpus_per_node = 1
+        self.num_tasks_per_node = 1
+        self.sourcepath = 'automatic_arrays.f90'
+        self.sanity_patterns = sn.assert_found(r'Result: ', self.stdout)
+        self.perf_patterns = {
+            'perf': sn.extractsingle(r'Timing:\s+(?P<perf>\S+)',
+                                     self.stdout, 'perf', float)
+        }
+
+        self.arrays_reference = {
+            'PrgEnv-cray': {
+                'daint:gpu': {'perf': (5.7E-05, None, 0.15)},
+                'dom:gpu': {'perf': (5.8E-05, None, 0.15)},
+                'kesch:cn': {'perf': (2.9E-04, None, 0.15)},
+            },
+            'PrgEnv-gnu': {
+                'daint:gpu': {'perf': (7.0E-03, None, 0.15)},
+                'dom:gpu': {'perf': (7.3E-03, None, 0.15)},
+                'kesch:cn': {'perf': (6.5E-03, None, 0.15)},
+            },
+            'PrgEnv-pgi': {
+                'daint:gpu': {'perf': (6.4E-05, None, 0.15)},
+                'dom:gpu': {'perf': (6.3E-05, None, 0.15)},
+                'kesch:cn': {'perf': (1.4E-04, None, 0.15)},
+            }
+        }
+
+        self.maintainers = ['AJ', 'VK']
+        self.tags = {'production'}
+
+    def setup(self, partition, environ, **job_opts):
+        if environ.name.startswith('PrgEnv-cray'):
+            environ.fflags = '-O2 -hacc -hnoomp'
+            key = 'PrgEnv-cray'
+            self.variables = self._cray_variables
+        elif environ.name.startswith('PrgEnv-pgi'):
+            environ.fflags = self._pgi_flags
+            key = 'PrgEnv-pgi'
+        elif environ.name.startswith('PrgEnv-gnu'):
+            environ.fflags = '-O2'
+            key = 'PrgEnv-gnu'
+
+        self.reference = self.arrays_reference[key]
+        super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/mch/src/automatic_arrays.f90 b/cscs-checks/mch/src/automatic_arrays.f90
new file mode 100644
index 0000000000..926ab9e734
--- /dev/null
+++ b/cscs-checks/mch/src/automatic_arrays.f90
@@ -0,0 +1,562 @@
+! This code tests whether the compiler implements a data pool for automatic arrays
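+! A compiler that implements such a pool re-uses one persistent allocation
+! for automatic arrays instead of allocating and freeing them on every call.
+! The program times the same kernel once with preallocated module work arrays
+! and once with automatic arrays, on both CPU and GPU, and reports FAIL if
+! the automatic-array variant is more than max_rel_diff slower.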
+module setup
+  implicit none
+
+  integer, parameter :: nvec = 20000
+  integer, parameter :: niter = 10
+
+  real*8, parameter :: max_rel_diff = 0.15
+
+  public :: nvec, niter, max_rel_diff
+
+end module
+
+module workarrays
+  implicit none
+  real*8, allocatable :: a1(:), a2(:), a3(:), a4(:), a5(:), a6(:), a7(:), &
+                         a8(:), a9(:)
+  real*8, allocatable :: aa1(:,:), aa2(:,:), aa3(:,:)
+  real*8, allocatable :: zparam(:)
+
+  public :: a1, a2, a3, a4, a5, a6, a7, a8, a9
+  public :: aa1, aa2, aa3
+  public :: zparam
+
+  public :: allocate_arrays, deallocate_arrays
+
+  contains
+
+  subroutine allocate_arrays(nvec, lacc)
+    integer, intent(in) :: nvec
+    logical, intent(in) :: lacc
+
+    allocate(a1(nvec), a2(nvec), a3(nvec), a4(nvec), a5(nvec), a6(nvec), &
+             a7(nvec), a8(nvec), a9(nvec))
+    a1 = 0D0
+    a2 = 0D0
+    a3 = 0D0
+    a4 = 0D0
+    a5 = 0D0
+    a6 = 0D0
+    a7 = 0D0
+    a8 = 0D0
+    a9 = 0D0
+
+    allocate(aa1(nvec,5), aa2(nvec,5), aa3(nvec,5))
+    aa1 = 0D0
+    aa2 = 0D0
+    aa3 = 0D0
+
+    allocate(zparam(8))
+    zparam = 0D0
+
+    !$acc enter data copyin( a1, a2, a3, a4, a5, a6, a7, a8, a9, &
+    !$acc                    aa1, aa2, aa3, zparam ) if( lacc )
+
+  end subroutine allocate_arrays
+
+  subroutine deallocate_arrays(lacc)
+    logical, intent(in) :: lacc
+
+    !$acc exit data delete( a1, a2, a3, a4, a5, a6, a7, a8, a9, &
+    !$acc                   aa1, aa2, aa3, zparam ) if( lacc )
+
+    deallocate(a1, a2, a3, a4, a5, a6, a7, a8, a9)
+
+    deallocate(aa1, aa2, aa3)
+
+    deallocate(zparam)
+
+  end subroutine deallocate_arrays
+
+end module workarrays
+
+module computation
+  implicit none
+
+  PUBLIC :: cpu_workarrays, cpu_automatic, gpu_workarrays, gpu_automatic
+  contains
+
+  subroutine cpu_workarrays(nvec,a,b)
+    USE workarrays
+    integer, intent(in) :: nvec
+    real*8, intent(inout) :: a(nvec)
+    real*8, intent(in) :: b(nvec)
+
+    integer :: i, k, iparam, il
+
+    do iparam=1,8
+      zparam(iparam) = 0.1D0*iparam
+    end do
+
+    do i=1,nvec
+      a1(i) = 0.1D0*(1.0D0+1.9D0/i)
+      a2(i) = 0.2D0*(1.1D0+1.8D0/i)
+      a3(i) = 0.3D0*(1.2D0+1.7D0/i)
+      a4(i) = 0.4D0*(1.3D0+1.6D0/i)
+      a5(i) = 0.5D0*(1.4D0+1.5D0/i)
+      a6(i) = 0.6D0*(1.5D0+1.4D0/i)
+      a7(i) = 0.7D0*(1.6D0+1.3D0/i)
+      a8(i) = 0.8D0*(1.7D0+1.2D0/i)
+      a9(i) = 0.9D0*(1.8D0+1.1D0/i)
+
+      aa1(i,1) = 0.5D0*(1.0D0+(1.0D0-0.1D0)/i)
+      aa1(i,2) = 0.5D0*(1.0D0+(1.0D0-0.2D0)/i)
+      aa1(i,3) = 0.5D0*(1.0D0+(1.0D0-0.3D0)/i)
+      aa1(i,4) = 0.5D0*(1.0D0+(1.0D0-0.4D0)/i)
+      aa1(i,5) = 0.5D0*(1.0D0+(1.0D0-0.5D0)/i)
+
+      aa2(i,1) = 0.7D0*(1.0D0+(1.0D0+0.1D0)/i)
+      aa2(i,2) = 0.7D0*(1.0D0+(1.0D0+0.2D0)/i)
+      aa2(i,3) = 0.7D0*(1.0D0+(1.0D0+0.3D0)/i)
+      aa2(i,4) = 0.7D0*(1.0D0+(1.0D0+0.4D0)/i)
+      aa2(i,5) = 0.7D0*(1.0D0+(1.0D0+0.5D0)/i)
+
+      aa3(i,1) = 0.9D0*(1.0D0+(1.0D0-0.9D0)/i)
+      aa3(i,2) = 0.9D0*(1.0D0+(1.0D0-0.8D0)/i)
+      aa3(i,3) = 0.9D0*(1.0D0+(1.0D0-0.7D0)/i)
+      aa3(i,4) = 0.9D0*(1.0D0+(1.0D0-0.6D0)/i)
+      aa3(i,5) = 0.9D0*(1.0D0+(1.0D0-0.5D0)/i)
+    end do
+
+    do i=1,nvec
+      do iparam=1,8 ! repeat to do more operations
+        a(i) = zparam(iparam)*(1.0D0+cos(a(i))) + b(i)*(1.0D0 + sin(1.0D0 &
+               +a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)+a7(i)+a8(i)+a9(i) &
+               +aa1(i,1)+aa1(i,2)+aa1(i,3)+aa1(i,4)+aa1(i,5) &
+               +aa2(i,1)+aa2(i,2)+aa2(i,3)+aa2(i,4)+aa2(i,5) &
+               +aa3(i,1)+aa3(i,2)+aa3(i,3)+aa3(i,4)+aa3(i,5)))
+      end do
+    end do !i
+
+  end subroutine cpu_workarrays
+
+  subroutine cpu_automatic(nvec,a,b)
+    integer, intent(in) :: nvec
+    real*8, intent(inout) :: a(nvec)
+    real*8, intent(in) :: b(nvec)
+
+    integer :: i, k, iparam, il
+    real*8 :: a1(nvec), a2(nvec), a3(nvec), a4(nvec), a5(nvec), a6(nvec), &
+              a7(nvec), a8(nvec), a9(nvec)
+    real*8 :: aa1(nvec,5), aa2(nvec,5), aa3(nvec,5)
+    real*8 :: zparam(8)
+
+    do iparam=1,8
+      zparam(iparam) = 0.1D0*iparam
+    end do
+
+    do i=1,nvec
+      a1(i) = 0.1D0*(1.0D0+1.9D0/i)
+      a2(i) = 0.2D0*(1.1D0+1.8D0/i)
+      a3(i) = 0.3D0*(1.2D0+1.7D0/i)
+      a4(i) = 0.4D0*(1.3D0+1.6D0/i)
+      a5(i) = 0.5D0*(1.4D0+1.5D0/i)
+      a6(i) = 0.6D0*(1.5D0+1.4D0/i)
+      a7(i) = 0.7D0*(1.6D0+1.3D0/i)
+      a8(i) = 0.8D0*(1.7D0+1.2D0/i)
+      a9(i) = 0.9D0*(1.8D0+1.1D0/i)
+
+      aa1(i,1) = 0.5D0*(1.0D0+(1.0D0-0.1D0)/i)
+      aa1(i,2) = 0.5D0*(1.0D0+(1.0D0-0.2D0)/i)
+      aa1(i,3) = 0.5D0*(1.0D0+(1.0D0-0.3D0)/i)
+      aa1(i,4) = 0.5D0*(1.0D0+(1.0D0-0.4D0)/i)
+      aa1(i,5) = 0.5D0*(1.0D0+(1.0D0-0.5D0)/i)
+
+      aa2(i,1) = 0.7D0*(1.0D0+(1.0D0+0.1D0)/i)
+      aa2(i,2) = 0.7D0*(1.0D0+(1.0D0+0.2D0)/i)
+      aa2(i,3) = 0.7D0*(1.0D0+(1.0D0+0.3D0)/i)
+      aa2(i,4) = 0.7D0*(1.0D0+(1.0D0+0.4D0)/i)
+      aa2(i,5) = 0.7D0*(1.0D0+(1.0D0+0.5D0)/i)
+
+      aa3(i,1) = 0.9D0*(1.0D0+(1.0D0-0.9D0)/i)
+      aa3(i,2) = 0.9D0*(1.0D0+(1.0D0-0.8D0)/i)
+      aa3(i,3) = 0.9D0*(1.0D0+(1.0D0-0.7D0)/i)
+      aa3(i,4) = 0.9D0*(1.0D0+(1.0D0-0.6D0)/i)
+      aa3(i,5) = 0.9D0*(1.0D0+(1.0D0-0.5D0)/i)
+    end do
+
+    do i=1,nvec
+      do iparam=1,8 ! repeat to do more operations
+        a(i) = zparam(iparam)*(1.0D0+cos(a(i))) + b(i)*(1.0D0 + sin(1.0D0 &
+               +a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)+a7(i)+a8(i)+a9(i) &
+               +aa1(i,1)+aa1(i,2)+aa1(i,3)+aa1(i,4)+aa1(i,5) &
+               +aa2(i,1)+aa2(i,2)+aa2(i,3)+aa2(i,4)+aa2(i,5) &
+               +aa3(i,1)+aa3(i,2)+aa3(i,3)+aa3(i,4)+aa3(i,5)))
+      end do
+    end do
+
+  end subroutine cpu_automatic
+
+  subroutine gpu_workarrays(nvec,a,b)
+    USE workarrays
+    integer, intent(in) :: nvec
+    real*8, intent(inout) :: a(nvec)
+    real*8, intent(in) :: b(nvec)
+
+    integer :: i, k, iparam, il
+
+    !$acc data present( a, b, &
+    !$acc               a1, a2, a3, a4, a5, a6, a7, a8, a9, &
+    !$acc               aa1, aa2, aa3, &
+    !$acc               zparam )
+
+    !$acc parallel
+    !$acc loop gang vector
+    do iparam=1,8
+      zparam(iparam) = 0.1D0*iparam
+    end do
+    !$acc end parallel
+
+    !$acc parallel
+    !$acc loop gang vector
+    do i=1,nvec
+      a1(i) = 0.1D0*(1.0D0+1.9D0/i)
+      a2(i) = 0.2D0*(1.1D0+1.8D0/i)
+      a3(i) = 0.3D0*(1.2D0+1.7D0/i)
+      a4(i) = 0.4D0*(1.3D0+1.6D0/i)
+      a5(i) = 0.5D0*(1.4D0+1.5D0/i)
+      a6(i) = 0.6D0*(1.5D0+1.4D0/i)
+      a7(i) = 0.7D0*(1.6D0+1.3D0/i)
+      a8(i) = 0.8D0*(1.7D0+1.2D0/i)
+      a9(i) = 0.9D0*(1.8D0+1.1D0/i)
+
+      aa1(i,1) = 0.5D0*(1.0D0+(1.0D0-0.1D0)/i)
+      aa1(i,2) = 0.5D0*(1.0D0+(1.0D0-0.2D0)/i)
+      aa1(i,3) = 0.5D0*(1.0D0+(1.0D0-0.3D0)/i)
+      aa1(i,4) = 0.5D0*(1.0D0+(1.0D0-0.4D0)/i)
+      aa1(i,5) = 0.5D0*(1.0D0+(1.0D0-0.5D0)/i)
+
+      aa2(i,1) = 0.7D0*(1.0D0+(1.0D0+0.1D0)/i)
+      aa2(i,2) = 0.7D0*(1.0D0+(1.0D0+0.2D0)/i)
+      aa2(i,3) = 0.7D0*(1.0D0+(1.0D0+0.3D0)/i)
+      aa2(i,4) = 0.7D0*(1.0D0+(1.0D0+0.4D0)/i)
+      aa2(i,5) = 0.7D0*(1.0D0+(1.0D0+0.5D0)/i)
+
+      aa3(i,1) = 0.9D0*(1.0D0+(1.0D0-0.9D0)/i)
+      aa3(i,2) = 0.9D0*(1.0D0+(1.0D0-0.8D0)/i)
+      aa3(i,3) = 0.9D0*(1.0D0+(1.0D0-0.7D0)/i)
+      aa3(i,4) = 0.9D0*(1.0D0+(1.0D0-0.6D0)/i)
+      aa3(i,5) = 0.9D0*(1.0D0+(1.0D0-0.5D0)/i)
+    end do
+    !$acc end parallel
+
+    !$acc parallel
+    !$acc loop gang vector
+    do i=1,nvec
+      !$acc loop seq
+      do iparam=1,8 ! repeat to do more operations
+        a(i) = zparam(iparam)*(1.0D0+cos(a(i))) + b(i)*(1.0D0 + sin(1.0D0 &
+               +a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)+a7(i)+a8(i)+a9(i) &
+               +aa1(i,1)+aa1(i,2)+aa1(i,3)+aa1(i,4)+aa1(i,5) &
+               +aa2(i,1)+aa2(i,2)+aa2(i,3)+aa2(i,4)+aa2(i,5) &
+               +aa3(i,1)+aa3(i,2)+aa3(i,3)+aa3(i,4)+aa3(i,5)))
+      end do
+    end do
+    !$acc end parallel
+
+    !$acc end data
+
+  end subroutine gpu_workarrays
+
+  subroutine gpu_automatic(nvec,a,b)
+    integer, intent(in) :: nvec
+    real*8, intent(inout) :: a(nvec)
+    real*8, intent(in) :: b(nvec)
+
+    integer :: i, k, iparam, il
+    real*8 :: a1(nvec), a2(nvec), a3(nvec), a4(nvec), a5(nvec), a6(nvec), &
+              a7(nvec), a8(nvec), a9(nvec)
+    real*8 :: aa1(nvec,5), aa2(nvec,5), aa3(nvec,5)
+    real*8 :: zparam(8)
+
+    !$acc data present( a, b ) &
+    !$acc create( a1, a2, a3, a4, a5, a6, a7, a8, a9, &
+    !$acc         aa1, aa2, aa3, &
+    !$acc         zparam )
+
+    !$acc parallel
+    !$acc loop gang vector
+    do iparam=1,8
+      zparam(iparam) = 0.1D0*iparam
+    end do
+    !$acc end parallel
+
+    !$acc parallel
+    !$acc loop gang vector
+    do i=1,nvec
+      a1(i) = 0.1D0*(1.0D0+1.9D0/i)
+      a2(i) = 0.2D0*(1.1D0+1.8D0/i)
+      a3(i) = 0.3D0*(1.2D0+1.7D0/i)
+      a4(i) = 0.4D0*(1.3D0+1.6D0/i)
+      a5(i) = 0.5D0*(1.4D0+1.5D0/i)
+      a6(i) = 0.6D0*(1.5D0+1.4D0/i)
+      a7(i) = 0.7D0*(1.6D0+1.3D0/i)
+      a8(i) = 0.8D0*(1.7D0+1.2D0/i)
+      a9(i) = 0.9D0*(1.8D0+1.1D0/i)
+
+      aa1(i,1) = 0.5D0*(1.0D0+(1.0D0-0.1D0)/i)
+      aa1(i,2) = 0.5D0*(1.0D0+(1.0D0-0.2D0)/i)
+      aa1(i,3) = 0.5D0*(1.0D0+(1.0D0-0.3D0)/i)
+      aa1(i,4) = 0.5D0*(1.0D0+(1.0D0-0.4D0)/i)
+      aa1(i,5) = 0.5D0*(1.0D0+(1.0D0-0.5D0)/i)
+
+      aa2(i,1) = 0.7D0*(1.0D0+(1.0D0+0.1D0)/i)
+      aa2(i,2) = 0.7D0*(1.0D0+(1.0D0+0.2D0)/i)
+      aa2(i,3) = 0.7D0*(1.0D0+(1.0D0+0.3D0)/i)
+      aa2(i,4) = 0.7D0*(1.0D0+(1.0D0+0.4D0)/i)
+      aa2(i,5) = 0.7D0*(1.0D0+(1.0D0+0.5D0)/i)
+
+      aa3(i,1) = 0.9D0*(1.0D0+(1.0D0-0.9D0)/i)
+      aa3(i,2) = 0.9D0*(1.0D0+(1.0D0-0.8D0)/i)
+      aa3(i,3) = 0.9D0*(1.0D0+(1.0D0-0.7D0)/i)
+      aa3(i,4) = 0.9D0*(1.0D0+(1.0D0-0.6D0)/i)
+      aa3(i,5) = 0.9D0*(1.0D0+(1.0D0-0.5D0)/i)
+    end do
+    !$acc end parallel
+
+    !$acc parallel
+    !$acc loop gang vector
+    do i=1,nvec
+      !$acc loop seq
+      do iparam=1,8 ! repeat to do more operations
+        a(i) = zparam(iparam)*(1.0D0+cos(a(i))) + b(i)*(1.0D0 + sin(1.0D0 &
+               +a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)+a7(i)+a8(i)+a9(i) &
+               +aa1(i,1)+aa1(i,2)+aa1(i,3)+aa1(i,4)+aa1(i,5) &
+               +aa2(i,1)+aa2(i,2)+aa2(i,3)+aa2(i,4)+aa2(i,5) &
+               +aa3(i,1)+aa3(i,2)+aa3(i,3)+aa3(i,4)+aa3(i,5)))
+      end do
+    end do
+    !$acc end parallel
+
+    !$acc end data
+
+  end subroutine gpu_automatic
+
+end module computation
+
+program AutomaticArrays
+  use setup, only: nvec, niter, max_rel_diff
+  use computation
+  use workarrays, only: allocate_arrays, deallocate_arrays
+  implicit none
+  include 'mpif.h'
+
+  real*8, allocatable :: a(:), b(:), ref_a(:)
+
+  integer :: ierr
+  integer :: mpi_size, mpi_rank
+  integer :: nt, i
+  real*8 :: walltime(4,niter)
+  real*8 :: error(4)
+
+  logical :: validated, success_cpu = .false., success_gpu = .false. ! defaults if validation fails
+  real*8 :: mean_time(4) = 0D0, benchmark_cpu, benchmark_gpu
+
+  call MPI_Init(ierr)
+  call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)
+  call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)
+
+  !----------------------------------------------------------------------------!
+  ! CPU reference with work arrays
+
+  allocate(a(nvec), b(nvec), ref_a(nvec))
+  call allocate_arrays(nvec, .false.)
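+  ! Each of the four timed variants below runs niter times; the first
+  ! iteration is excluded from the mean (only walltime(:, 2:) is averaged),
+  ! so first-call overheads do not distort the comparison.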
+
+  do nt=1,niter
+    do i=1,nvec
+      a(i) = 0.0D0
+      b(i) = 0.1D0
+    end do
+
+    walltime(1,nt) = MPI_WTIME()
+    call cpu_workarrays(nvec,a,b)
+    walltime(1,nt) = MPI_WTIME() - walltime(1,nt)
+  end do
+
+  ref_a = a
+  error(1) = sum(ref_a - a)
+
+  call deallocate_arrays(.false.)
+  deallocate(a, b)
+
+  !----------------------------------------------------------------------------!
+  ! CPU reference with automatic arrays
+
+  allocate(a(nvec), b(nvec))
+
+  do nt=1,niter
+    do i=1,nvec
+      a(i) = 0.0D0
+      b(i) = 0.1D0
+    end do
+
+    walltime(2,nt) = MPI_WTIME()
+    call cpu_automatic(nvec,a,b)
+    walltime(2,nt) = MPI_WTIME() - walltime(2,nt)
+  end do
+
+  error(2) = sum(ref_a - a)
+
+  deallocate(a, b)
+
+  !----------------------------------------------------------------------------!
+  ! GPU dummy calculations to warm up the device
+  allocate(a(nvec), b(nvec))
+  !$acc data create(a, b)
+  !$acc parallel
+  !$acc loop gang vector
+  do i=1,nvec
+    a(i) = 1.3*i**0.5
+    b(i) = 0.3*i**0.75
+  end do
+  !$acc end parallel
+
+  !$acc parallel
+  !$acc loop seq
+  do nt=1, 10
+    !$acc loop gang vector
+    do i=1,nvec
+      b(i) = a(i)*a(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc end data
+  deallocate(a, b)
+
+  !----------------------------------------------------------------------------!
+  ! GPU OpenACC with work arrays
+
+  allocate(a(nvec), b(nvec))
+  call allocate_arrays(nvec, .true.)
+
+  !$acc data create( a, b )
+
+  do nt=1,niter
+    !$acc parallel
+    !$acc loop gang vector
+    do i=1,nvec
+      a(i) = 0.0D0
+      b(i) = 0.1D0
+    end do
+    !$acc end parallel
+
+    !$acc wait
+    walltime(3,nt) = MPI_WTIME()
+    call gpu_workarrays(nvec,a,b)
+    !$acc wait
+    walltime(3,nt) = MPI_WTIME() - walltime(3,nt)
+  end do
+
+  !$acc update host( a )
+
+  !$acc end data
+
+  error(3) = sum(ref_a - a)
+
+  call deallocate_arrays(.true.)
+  deallocate(a, b)
+
+  !----------------------------------------------------------------------------!
+  ! GPU OpenACC with automatic arrays
+
+  allocate(a(nvec), b(nvec))
+
+  !$acc data create( a, b )
+
+  do nt=1,niter
+    !$acc parallel
+    !$acc loop gang vector
+    do i=1,nvec
+      a(i) = 0.0D0
+      b(i) = 0.1D0
+    end do
+    !$acc end parallel
+
+    !$acc wait
+    walltime(4,nt) = MPI_WTIME()
+    call gpu_automatic(nvec,a,b)
+    !$acc wait
+    walltime(4,nt) = MPI_WTIME() - walltime(4,nt)
+  end do
+
+  !$acc update host( a )
+
+  !$acc end data
+
+  error(4) = sum(ref_a - a)
+
+  deallocate(a, b)
+
+  !----------------------------------------------------------------------------!
+  ! Check results
+
+  ! Check if calculations are correct
+  if (any(abs(error) > 1e-10)) then
+    validated = .false.
+    write (*,*) "Calculations did not validate and timings are meaningless."
+    write (*,*) "Absolute errors: ", abs(error)
+  else
+    validated = .true.
+  end if
+
+  ! Check timings
+  if (validated) then
+    ! Check CPU timing
+    mean_time(1) = sum(walltime(1, 2:))/(niter - 1)
+    mean_time(2) = sum(walltime(2, 2:))/(niter - 1)
+    benchmark_cpu = (mean_time(2) - mean_time(1))/mean_time(2)
+    if (benchmark_cpu > max_rel_diff) then
+      success_cpu = .false.
+      write (*,'(A)') "Compiler doesn't implement a data pool for CPU!"
+      write (*,'(A,F5.1,A,F5.1,A)') "Relative difference is too large: ", &
+        benchmark_cpu*100, "% > ", max_rel_diff*100, "%"
+    else
+      success_cpu = .true.
+    end if
+    if (.not. success_cpu) then
+      write (*,'(A,ES9.3,A)') "CPU work arrays timing: ", mean_time(1), " s"
+      write (*,'(A,ES9.3,A)') "CPU automatic arrays timing: ", mean_time(2), " s"
+      write (*,'(A,F5.1,A)') "CPU automatic arrays relative timing: ", benchmark_cpu*100, "%"
+      write (*,*) ""
+    end if
+
+    ! Check GPU timing
+    mean_time(3) = sum(walltime(3, 2:))/(niter - 1)
+    mean_time(4) = sum(walltime(4, 2:))/(niter - 1)
+    benchmark_gpu = (mean_time(4) - mean_time(3))/mean_time(4)
+    if (benchmark_gpu > max_rel_diff) then
+      success_gpu = .false.
+      write (*,'(A)') "Compiler doesn't implement a data pool for GPU!"
+      write (*,'(A,F5.1,A,F5.1,A)') "Relative difference is too large: ", &
+        benchmark_gpu*100, "% > ", max_rel_diff*100, "%"
+    else
+      success_gpu = .true.
+    end if
+    if (.not. success_gpu) then
+      write (*,'(A,ES9.3,A)') "GPU work arrays timing: ", mean_time(3), " s"
+      write (*,'(A,ES9.3,A)') "GPU automatic arrays timing: ", mean_time(4), " s"
+      write (*,'(A,F5.1,A)') "GPU automatic arrays relative timing: ", benchmark_gpu*100, "%"
+      write (*,*) ""
+    end if
+  end if
+
+  write (*,'(A,ES9.3,A)') "Timing: ", mean_time(4), " s"
+  if (success_gpu) then
+    write (*,'(A)') "Result: OK"
+  else
+    write (*,'(A)') "Result: FAIL"
+  end if
+
+  call MPI_Finalize(ierr)
+
+end program AutomaticArrays