/
test3D_r2c.c
370 lines (301 loc) · 11.3 KB
/
test3D_r2c.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/*
This program exemplifies using P3DFFT++ library.
! This program initializes a 3D array with a 3D sine wave, then
! performs 3D forward Fourier transform, then backward transform,
! and checks that
! the results are correct, namely the same as in the start except
! for a normalization factor. It can be used both as a correctness
! test and for timing the library functions.
!
! The program expects 'stdin' file in the working directory, with
! a single line of numbers : Nx,Ny,Nz,Ndim,Nrep. Here Nx,Ny,Nz
! are box dimensions, Ndim is the dimentionality of processor grid
! (1 or 2), and Nrep is the number of repititions. Optionally
! a file named 'dims' can also be provided to guide in the choice
! of processor geometry in case of 2D decomposition. It should contain
! two numbers in a line, with their product equal to the total number
! of tasks. Otherwise processor grid geometry is chosen automatically.
! For better performance, experiment with this setting, varying
! iproc and jproc. In many cases, minimizing iproc gives best results.
! Setting it to 1 corresponds to one-dimensional decomposition.
*/
#include "p3dfft.h"
//#include "defs.h"
#include <math.h>
#include <stdio.h>
void init_wave(double *,int[3],int *,int[3]);
void print_res(double *,int *,int *,int *, int *);
void normalize(double *,long int,int *);
double check_res(double*,double *,int *, int *);
main(int argc,char **argv)
{
int N=64;
int Nrep = 1;
int myid,nprocs;
int gdims[3],gdims2[3];
int pgrid1[3],pgrid2[3];
int proc_order[3];
int mem_order[3];
int mem_order2[] = {2,1,0};
int i,j,k,x,y,z,p1,p2;
double Nglob;
int imo1[3];
int *ldims,*ldims2;
long int size1,size2;
double *IN;
Grid *grid1,*grid2;
int *glob_start,*glob_start2;
double *OUT;
int type_ids1[3];
int type_ids2[3];
Type3D type_rcc,type_ccr;
double t=0.;
double *FIN;
double mydiff;
double diff = 0.0;
double gtavg=0.;
double gtmin=INFINITY;
double gtmax = 0.;
int pdims[2],nx,ny,nz,n,ndim;
Plan3D trans_f,trans_b;
FILE *fp;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
if(myid == 0) {
printf("P3DFFT++ C test program. Running on %d cores\n",nprocs);
if((fp=fopen("stdin", "r"))==NULL){
printf("Cannot open file. Setting to default nx=ny=nz=128, ndim=2, n=1.\n");
nx=ny=nz=128; Nrep=1;ndim=2;
} else {
fscanf(fp,"%d %d %d %d %d\n",&nx,&ny,&nz,&ndim,&Nrep);
fclose(fp);
}
printf("P3DFFT test, 3D wave input\n");
#ifndef SINGLE_PREC
printf("Double precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,Nrep);
#else
printf("Single precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,Nrep);
#endif
}
MPI_Bcast(&nx,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&ny,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&nz,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&Nrep,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&ndim,1,MPI_INT,0,MPI_COMM_WORLD);
//! Establish 2D processor grid decomposition, either by readin from file 'dims' or by an MPI default
if(ndim == 1) {
pdims[0] = 1; pdims[1] = nprocs;
if(myid == 0)
printf("Using one-dimensional decomposition\n");
}
else if(ndim == 2) {
if(myid == 0)
printf("Using two-dimensional decomposition\n");
fp = fopen("dims","r");
if(fp != NULL) {
if(myid == 0)
printf("Reading proc. grid from file dims\n");
fscanf(fp,"%d %d\n",pdims,pdims+1);
fclose(fp);
if(pdims[0]*pdims[1] != nprocs)
pdims[1] = nprocs / pdims[0];
}
else {
if(myid == 0)
printf("Creating proc. grid with mpi_dims_create\n");
pdims[0]=pdims[1]=0;
MPI_Dims_create(nprocs,2,pdims);
if(pdims[0] > pdims[1]) {
pdims[0] = pdims[1];
pdims[1] = nprocs/pdims[0];
}
}
}
if(myid == 0)
printf("Using processor grid %d x %d\n",pdims[0],pdims[1]);
// Set up work structures for P3DFFT
p3dfft_setup();
//Set up 2 transform types for 3D transforms
type_ids1[0] = P3DFFT_R2CFFT_D;
type_ids1[1] = P3DFFT_CFFT_FORWARD_D;
type_ids1[2] = P3DFFT_CFFT_FORWARD_D;
type_ids2[0] = P3DFFT_C2RFFT_D;
type_ids2[1] = P3DFFT_CFFT_BACKWARD_D;
type_ids2[2] = P3DFFT_CFFT_BACKWARD_D;
//Now initialize 3D transforms (forward and backward) with these types
type_rcc = p3dfft_init_3Dtype(type_ids1);
type_ccr = p3dfft_init_3Dtype(type_ids2);
//Set up global dimensions of the grid
gdims[0] = nx;
gdims[1] = ny;
gdims[2] = nz;
for(i=0; i < 3;i++) {
proc_order[i] = mem_order[i] = i; // The simplest case of sequential ordering
}
p1 = pdims[0];
p2 = pdims[1];
// Define the initial processor grid. In this case, it's a 2D pencil, with 1st dimension local and the 2nd and 3rd split by iproc and jproc tasks respectively
pgrid1[0] = 1;
pgrid1[1] = p1;
pgrid1[2] = p2;
//Define the final processor grid. It can be the same as initial grid, however here it is different. First, in real-to-complex and complex-to-real transforms the global grid dimensions change for example from (n0,n1,n2) to (n0/2+1,n1,n2), since most applications attempt to save memory by using the conjugate symmetry of the Fourier transform of real data. Secondly, the final grid may have different processor distribution and memory ordering, since for example many applications with convolution and those solving partial differential equations do not need the initial grid configuration in Fourier space. The flow of these applications is typically 1) transform from physical to Fourier space, 2) apply convolution or derivative calculation in Fourier space, and 3) inverse FFT to physical space. Since forward FFT's last step is 1D FFT in the third dimension, it is more efficient to leave this dimension local and stride-1, and since the first step of the inverse FFT is to start with the third dimension 1D FFT, this format naturally fits the algorithm and results in big savings of time due to elimination of several extra transposes.
pgrid2[0] =p1;
pgrid2[1] = p2;
pgrid2[2] = 1;
// Set up the final global grid dimensions (these will be different from the original dimensions in one dimension since we are doing real-to-complex transform)
for(i=0; i < 3;i++)
gdims2[i] = gdims[i];
gdims2[0] = gdims2[0]/2+1;
//Initialize initial and final grids, based on the above information
grid1 = p3dfft_init_grid(gdims,pgrid1,proc_order,mem_order,MPI_COMM_WORLD);
grid2 = p3dfft_init_grid(gdims2,pgrid2,proc_order,mem_order2,MPI_COMM_WORLD);
//Set up the forward transform, based on the predefined 3D transform type and grid1 and grid2. This is the planning stage, needed once as initialization.
trans_f = p3dfft_plan_3Dtrans(grid1,grid2,type_rcc,1);
//Now set up the backward transform
trans_b = p3dfft_plan_3Dtrans(grid2,grid1,type_ccr,1);
//Save the local array dimensions.
ldims = grid1->ldims;
size1 = ldims[0]*ldims[1]*ldims[2];
//Now allocate initial and final arrays in physical space as real-valued 1D storage containing a contiguous 3D local array
IN=(double *) malloc(sizeof(double)*size1);
FIN= (double *) malloc(sizeof(double) *size1);
//Initialize the IN array with a sine wave in 3D, based on the starting positions of my local grid within the global grid
glob_start = grid1->glob_start;
init_wave(IN,gdims,ldims,glob_start);
//Determine local array dimensions and allocate fourier space, complex-valued out array
ldims2 = grid2->ldims;
glob_start2 = grid2->glob_start;
size2 = ldims2[0]*ldims2[1]*ldims2[2];
OUT=(double *) malloc(sizeof(double) *size2 *2);
// Warm-up run, forward transform
p3dfft_exec_3Dtrans_double(trans_f,IN,OUT,0);
Nglob = gdims[0]*gdims[1]*gdims[2];
// timing loop
for(i=0; i < Nrep;i++) {
t -= MPI_Wtime();
p3dfft_exec_3Dtrans_double(trans_f,IN,OUT,0); // Forward real-to-complex 3D FFT
t += MPI_Wtime();
MPI_Barrier(MPI_COMM_WORLD);
if(myid == 0)
printf("Results of forward transform: \n");
print_res(OUT,gdims,ldims2,glob_start2,mem_order2);
normalize(OUT,size2,gdims);
t -= MPI_Wtime();
p3dfft_exec_3Dtrans_double(trans_b,OUT,FIN,0); // Backward (inverse) complex-to-real 3D FFT
t += MPI_Wtime();
}
mydiff = check_res(IN,FIN,ldims,mem_order);
diff = 0.;
MPI_Reduce(&mydiff,&diff,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
if(myid == 0) {
if(diff > 1.0e-14 * Nglob *0.25)
printf("Results are incorrect\n");
else
printf("Results are correct\n");
printf("Max. diff. =%lg\n",diff);
}
MPI_Reduce(&t,>avg,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
MPI_Reduce(&t,>min,1,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);
MPI_Reduce(&t,>max,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
if(myid == 0)
printf("Transform time (avg/min/max): %lf %lf %lf\n",gtavg/(nprocs*Nrep),gtmin/Nrep,gtmax/Nrep);
free(IN); free(OUT); free(FIN);
// Clean up grid structures
p3dfft_free_grid(grid1);
p3dfft_free_grid(grid2);
// Clean up all P3DFFT++ data
p3dfft_cleanup();
MPI_Finalize();
}
void normalize(double *A,long int size,int *gdims)
{
long int i;
double f = 1.0/(((double) gdims[0])*((double) gdims[1])*((double) gdims[2]));
for(i=0;i<size*2;i++)
A[i] = A[i] * f;
}
void init_wave(double *IN,int *gdims,int *ldims,int *gstart)
{
double *sinx,*siny,*sinz,sinyz,*p;
int x,y,z;
double twopi = atan(1.0)*8.0;
sinx = (double *) malloc(sizeof(double)*gdims[0]);
siny = (double *) malloc(sizeof(double)*gdims[1]);
sinz = (double *) malloc(sizeof(double)*gdims[2]);
for(z=0;z < ldims[2];z++)
sinz[z] = sin((z+gstart[2])*twopi/gdims[2]);
for(y=0;y < ldims[1];y++)
siny[y] = sin((y+gstart[1])*twopi/gdims[1]);
for(x=0;x < ldims[0];x++)
sinx[x] = sin((x+gstart[0])*twopi/gdims[0]);
p = IN;
for(z=0;z < ldims[2];z++)
for(y=0;y < ldims[1];y++) {
sinyz = siny[y]*sinz[z];
for(x=0;x < ldims[0];x++)
*p++ = sinx[x]*sinyz;
}
free(sinx); free(siny); free(sinz);
}
void print_res(double *A,int *gdims,int *ldims,int *gstart, int *mo)
{
int x,y,z;
double *p;
double Nglob;
int imo[3],i,j;
for(i=0;i<3;i++)
for(j=0;j<3;j++)
if(mo[i] == j)
imo[j] = i;
Nglob = gdims[0]*gdims[1]*gdims[2];
p = A;
for(z=0;z < ldims[imo[2]];z++)
for(y=0;y < ldims[imo[1]];y++)
for(x=0;x < ldims[imo[0]];x++) {
if(fabs(*p) > Nglob *1.25e-4 || fabs(*(p+1)) > Nglob *1.25e-4)
printf("(%d %d %d) %lg %lg\n",x+gstart[imo[0]],y+gstart[imo[1]],z+gstart[imo[2]],*p,*(p+1));
p+=2;
}
}
double check_res(double *A,double *B,int *ldims, int *mo)
{
int x,y,z;
double *p1,*p2,mydiff;
int imo[3],i,j;
p1 = A;
p2 = B;
for(i=0;i<3;i++)
for(j=0;j<3;j++)
if(mo[i] == j)
imo[j] = i;
mydiff = 0.;
for(z=0;z < ldims[imo[2]];z++)
for(y=0;y < ldims[imo[1]];y++)
for(x=0;x < ldims[imo[0]];x++) {
if(fabs(*p1 - *p2) > mydiff)
mydiff = fabs(*p1-*p2);
p1++;
p2++;
}
return(mydiff);
}
void write_buf(double *buf,char *label,int sz[3],int mo[3], int taskid) {
int i,j,k;
FILE *fp;
char str[80],filename[80];
double *p= buf;
strcpy(filename,label);
sprintf(str,".%d",taskid);
strcat(filename,str);
fp=fopen(filename,"w");
for(k=0;k<sz[mo[2]];k++)
for(j=0;j<sz[mo[1]];j++)
for(i=0;i<sz[mo[0]];i++) {
if(abs(*p) > 1.e-7) {
fprintf(fp,"(%d %d %d) %lg\n",i,j,k,*p);
}
p++;
}
fclose(fp);
}