-
Notifications
You must be signed in to change notification settings - Fork 1
/
jacobikernel.cu
118 lines (91 loc) · 4.07 KB
/
jacobikernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "GpuPoissonSolver.h"
__global__
void jacobiIter(char* matrix, int* col_idx, int* row_ptr, float* unk_vect, float* rhs_vect, int matrix_dim, int matrix_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= matrix_dim) return;
int n = idx + 1 < matrix_dim ? row_ptr[idx + 1] : matrix_size;
float sum = 0.f;
for (int i = row_ptr[idx]; i < n; i++) {
int j = col_idx[i];
if (idx != j) {
sum += matrix[i] * unk_vect[j];
}
}
unk_vect[idx] = (rhs_vect[idx] - sum) / -4.f;
}
__global__
void fitRange(float* red, float* green, float* blue, int len)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= len) return;
if (red[idx] < 0.f) red[idx] = 0.f;
if (red[idx] > 255.f) red[idx] = 255.f;
if (green[idx] < 0.f) green[idx] = 0.f;
if (green[idx] > 255.f) green[idx] = 255.f;
if (blue[idx] < 0.f) blue[idx] = 0.f;
if (blue[idx] > 255.f) blue[idx] = 255.f;
}
void gpuJacobiSolver(char* h_matrix,
int* h_col_idx,
int* h_row_ptr,
float* h_unk_vect_red,
float* h_unk_vect_green,
float* h_unk_vect_blue,
float* h_rhs_vect_red,
float* h_rhs_vect_green,
float* h_rhs_vect_blue,
int matrix_dim,
int matrix_size,
int iters)
{
char* d_matrix;
int* d_col_idx;
int* d_row_ptr;
float* d_unk_vect_red;
float* d_unk_vect_green;
float* d_unk_vect_blue;
float* d_rhs_vect_red;
float* d_rhs_vect_green;
float* d_rhs_vect_blue;
cudaMalloc((void**) &d_matrix, matrix_size * sizeof(char));
cudaMalloc((void**) &d_col_idx, matrix_size * sizeof(int));
cudaMalloc((void**) &d_row_ptr, matrix_dim * sizeof(int));
cudaMalloc((void**) &d_unk_vect_red, matrix_dim * sizeof(float));
cudaMalloc((void**) &d_unk_vect_green, matrix_dim * sizeof(float));
cudaMalloc((void**) &d_unk_vect_blue, matrix_dim * sizeof(float));
cudaMalloc((void**) &d_rhs_vect_red, matrix_dim * sizeof(float));
cudaMalloc((void**) &d_rhs_vect_green, matrix_dim * sizeof(float));
cudaMalloc((void**) &d_rhs_vect_blue, matrix_dim * sizeof(float));
cudaMemcpy(d_matrix, h_matrix, matrix_size * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(d_col_idx, h_col_idx, matrix_size * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_row_ptr, h_row_ptr, matrix_dim * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_unk_vect_red, h_unk_vect_red, matrix_dim * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_unk_vect_green, h_unk_vect_green, matrix_dim * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_unk_vect_blue, h_unk_vect_blue, matrix_dim * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_rhs_vect_red, h_rhs_vect_red, matrix_dim * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_rhs_vect_green, h_rhs_vect_green, matrix_dim * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_rhs_vect_blue, h_rhs_vect_blue, matrix_dim * sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = 128;
int blocks = (matrix_dim / threadsPerBlock) + !!(matrix_dim % threadsPerBlock);
for (int i = 0; i < iters; i++) {
jacobiIter<<<blocks, threadsPerBlock>>>(d_matrix, d_col_idx, d_row_ptr, d_unk_vect_red, d_rhs_vect_red, matrix_dim, matrix_size);
jacobiIter<<<blocks, threadsPerBlock>>>(d_matrix, d_col_idx, d_row_ptr, d_unk_vect_green, d_rhs_vect_green, matrix_dim, matrix_size);
jacobiIter<<<blocks, threadsPerBlock>>>(d_matrix, d_col_idx, d_row_ptr, d_unk_vect_blue, d_rhs_vect_blue, matrix_dim, matrix_size);
}
fitRange<<<blocks, threadsPerBlock>>>(d_unk_vect_red, d_unk_vect_green, d_unk_vect_blue, matrix_dim);
cudaMemcpy(h_unk_vect_red, d_unk_vect_red, matrix_dim * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_unk_vect_green, d_unk_vect_green, matrix_dim * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_unk_vect_blue, d_unk_vect_blue, matrix_dim * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_matrix);
cudaFree(d_col_idx);
cudaFree(d_row_ptr);
cudaFree(d_unk_vect_red);
cudaFree(d_unk_vect_green);
cudaFree(d_unk_vect_blue);
cudaFree(d_rhs_vect_red);
cudaFree(d_rhs_vect_green);
cudaFree(d_rhs_vect_blue);
}