-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cu
155 lines (127 loc) · 5.21 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/******************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "kernel.cu"
#include "support.h"
/*
 * Host driver for a single-precision matrix multiplication C = A x B.
 *
 * Command line:
 *   (no arguments)   all matrices are 1000 x 1000
 *   <m>              all matrices are m x m
 *   <m> <k> <n>      A is m x k, B is k x n, C is m x n
 * Any other argument count prints the usage text and exits.
 *
 * Steps: fill A and B on the host with pseudo-random values in [0, 0.99],
 * allocate device buffers, copy A and B to the device, call basicSgemm,
 * copy C back to the host, call verify on the result, and release all
 * buffers. Each phase is timed with the Timer helpers and every CUDA API
 * failure is reported through FATAL.
 *
 * Returns 0 on completion.
 */
int main (int argc, char *argv[])
{
    Timer timer;
    cudaError_t cuda_ret;

    // Initialize random number generator
    srand((unsigned) time(NULL));

    // Initialize host variables ----------------------------------------------
    printf("\nSetting up the problem..."); fflush(stdout);
    startTime(&timer);

    float *A_h, *B_h, *C_h;
    float *A_d, *B_d, *C_d;
    size_t A_sz, B_sz, C_sz;
    unsigned matArow, matAcol;
    unsigned matBrow, matBcol;

    if (argc == 1)
    {
        matArow = 1000;
        matAcol = matBrow = 1000;
        matBcol = 1000;
    }
    else if (argc == 2)
    {
        matArow = atoi(argv[1]);
        matAcol = matBrow = atoi(argv[1]);
        matBcol = atoi(argv[1]);
    }
    else if (argc == 4)
    {
        matArow = atoi(argv[1]);
        matAcol = matBrow = atoi(argv[2]);
        matBcol = atoi(argv[3]);
    }
    else
    {
        printf("\n Invalid input parameters!"
               "\n Usage: ./sgemm # All matrices are 1000 x 1000"
               "\n Usage: ./sgemm <m> # All matrices are m x m"
               "\n Usage: ./sgemm <m> <k> <n> # A: m x k, B: k x n, C: m x n"
               "\n");
        exit(0);
    }

    // Widen before multiplying so large dimensions do not wrap in 32-bit
    // unsigned arithmetic.
    A_sz = (size_t) matArow * matAcol;
    B_sz = (size_t) matBrow * matBcol;
    C_sz = (size_t) matArow * matBcol;

    // Byte counts shared by the allocations and the host<->device copies.
    const size_t A_bytes = sizeof(float) * A_sz;
    const size_t B_bytes = sizeof(float) * B_sz;
    const size_t C_bytes = sizeof(float) * C_sz;

    A_h = (float*) malloc(A_bytes);
    if (A_h == NULL) { FATAL("Unable to allocate A_h host memory"); }
    for (size_t i = 0; i < A_sz; i++) { A_h[i] = (rand()%100)/100.00; }

    B_h = (float*) malloc(B_bytes);
    if (B_h == NULL) { FATAL("Unable to allocate B_h host memory"); }
    for (size_t i = 0; i < B_sz; i++) { B_h[i] = (rand()%100)/100.00; }

    C_h = (float*) malloc(C_bytes);
    if (C_h == NULL) { FATAL("Unable to allocate C_h host memory"); }

    stopTime(&timer); printf("%f s\n", elapsedTime(timer));
    printf(" A: %u x %u\n B: %u x %u\n C: %u x %u\n", matArow, matAcol,
           matBrow, matBcol, matArow, matBcol);

    // Allocate device variables ----------------------------------------------
    printf("Allocating device variables..."); fflush(stdout);
    startTime(&timer);

    cuda_ret = cudaMalloc((void**)&A_d, A_bytes);
    if (cuda_ret != cudaSuccess) { FATAL("Unable to allocate A_d device memory"); }
    cuda_ret = cudaMalloc((void**)&B_d, B_bytes);
    if (cuda_ret != cudaSuccess) { FATAL("Unable to allocate B_d device memory"); }
    cuda_ret = cudaMalloc((void**)&C_d, C_bytes);
    if (cuda_ret != cudaSuccess) { FATAL("Unable to allocate C_d device memory"); }

    cuda_ret = cudaDeviceSynchronize();
    if (cuda_ret != cudaSuccess) { FATAL("Error synchronizing CUDA device"); }
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy host variables to device ------------------------------------------
    printf("Copying data from host to device..."); fflush(stdout);
    startTime(&timer);

    // Each matrix is copied with its own byte count; A and B may differ in size.
    cuda_ret = cudaMemcpy(A_d, A_h, A_bytes, cudaMemcpyHostToDevice);
    if (cuda_ret != cudaSuccess) { FATAL("Unable to copy A_h memory to device"); }
    cuda_ret = cudaMemcpy(B_d, B_h, B_bytes, cudaMemcpyHostToDevice);
    if (cuda_ret != cudaSuccess) { FATAL("Unable to copy B_h memory to device"); }

    cuda_ret = cudaDeviceSynchronize();
    if (cuda_ret != cudaSuccess) { FATAL("Error synchronizing CUDA device"); }
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Launch kernel using standard sgemm interface ---------------------------
    printf("Launching kernel..."); fflush(stdout);
    startTime(&timer);

    // NOTE(review): the last argument (leading dimension of C) is matBrow;
    // confirm against basicSgemm whether matArow was intended.
    basicSgemm('N', 'N', matArow, matBcol, matBrow, 1.0f,
               A_d, matArow, B_d, matBrow, 0.0f, C_d, matBrow);

    // Launch-time failures are reported via cudaGetLastError; failures during
    // execution surface at the synchronize below.
    cuda_ret = cudaGetLastError();
    if (cuda_ret != cudaSuccess) { FATAL("Unable to launch kernel"); }
    cuda_ret = cudaDeviceSynchronize();
    if (cuda_ret != cudaSuccess) { FATAL("Unable to launch kernel"); }
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy device variables to host ------------------------------------------
    printf("Copying data from device to host..."); fflush(stdout);
    startTime(&timer);

    cuda_ret = cudaMemcpy(C_h, C_d, C_bytes, cudaMemcpyDeviceToHost);
    if (cuda_ret != cudaSuccess) { FATAL("Error copying src (C_d) to dst (C_h) on Host device"); }

    cuda_ret = cudaDeviceSynchronize();
    if (cuda_ret != cudaSuccess) { FATAL("Error synchronizing CUDA device"); }
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Verify correctness -----------------------------------------------------
    printf("Verifying results..."); fflush(stdout);
    verify(A_h, B_h, C_h, matArow, matAcol, matBcol);

    // Free memory ------------------------------------------------------------
    free(A_h);
    free(B_h);
    free(C_h);

    cuda_ret = cudaFree(A_d);
    if (cuda_ret != cudaSuccess) { FATAL("Error freeing A_d"); }
    cuda_ret = cudaFree(B_d);
    if (cuda_ret != cudaSuccess) { FATAL("Error freeing B_d"); }
    cuda_ret = cudaFree(C_d);
    if (cuda_ret != cudaSuccess) { FATAL("Error freeing C_d"); }

    return 0;
}