diff --git a/rvv_saxpy.c b/rvv_saxpy.c index 6b4302570..a4de0f0e1 100644 --- a/rvv_saxpy.c +++ b/rvv_saxpy.c @@ -44,7 +44,7 @@ float output[N] = { 0.2484350696132857}; void saxpy_golden(size_t n, const float a, const float *x, float *y) { - for (size_t i; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { y[i] = a * x[i] + y[i]; } } @@ -55,11 +55,11 @@ void saxpy_vec(size_t n, const float a, const float *x, float *y) { vfloat32m8_t vx, vy; for (; (l = vsetvl_e32m8(n)) > 0; n -= l) { - vx = vle_v_f32m8(x); + vx = vle32_v_f32m8(x); x += l; - vy = vle_v_f32m8(y); + vy = vle32_v_f32m8(y); vy = vfmacc_vf_f32m8(vy, a, vx); - vse_v_f32m8 (y, vy); + vse32_v_f32m8 (y, vy); y += l; } } diff --git a/rvv_sgemm.c b/rvv_sgemm.c index 975ba9927..2a8831f05 100644 --- a/rvv_sgemm.c +++ b/rvv_sgemm.c @@ -53,9 +53,13 @@ void sgemm_golden() { for (size_t i = 0; i < MLEN; ++i) for (size_t j = 0; j < NLEN; ++j) for (size_t k = 0; k < KLEN; ++k) - c_array[i * NLEN + j] += a_array[i * KLEN + k] * b_array[j + k * NLEN]; + golden_array[i * NLEN + j] += a_array[i * KLEN + k] * b_array[j + k * NLEN]; } + +// reference https://github.com/riscv/riscv-v-spec/blob/master/example/sgemm.S +// c += a*b (alpha=1, no transpose on input matrices) +// matrices stored in C row-major order void sgemm_vec(size_t size_m, size_t size_n, size_t size_k, const float *a, // m * k matrix size_t lda, @@ -63,25 +67,23 @@ void sgemm_vec(size_t size_m, size_t size_n, size_t size_k, size_t ldb, float *c, // m * n matrix size_t ldc) { - int i, j, k; size_t vl; - vfloat32m1_t vec_c; - for (int i = 0; i < size_m; ++i) { - j = size_n; - const float *bnp = b; - float *cnp = c; - for (; vl = vsetvl_e32m1(j); j -= vl) { - const float *akp = a; - const float *bkp = bnp; - vec_c = *(vfloat32m1_t *)cnp; - for (k = 0; k < size_k; ++k) { - vec_c = vfmacc_vf_f32m1(vec_c, *akp, *(vfloat32m1_t *)bkp); - bkp += ldb; - akp++; + for (int m = 0; m < size_m; ++m) { + const float *b_n_ptr = b; + float *c_n_ptr = c; + for (int c_n_count = size_n; (vl = vsetvl_e32m1(c_n_count )); c_n_count -= vl) { + const float *a_k_ptr = a; + const float *b_k_ptr = b_n_ptr; + vfloat32m1_t acc = vle32_v_f32m1(c_n_ptr); + for (size_t k = 0; k < size_k; ++k) { + vfloat32m1_t b_n_data = vle32_v_f32m1(b_k_ptr); + acc = vfmacc_vf_f32m1(acc, *a_k_ptr, b_n_data); + b_k_ptr += ldb; + a_k_ptr++; } - *(vfloat32m1_t *)cnp = vec_c; - cnp += vl; - bnp += vl; + vse32_v_f32m1(c_n_ptr, acc); + c_n_ptr += vl; + b_n_ptr += vl; } a += lda; c += ldc; @@ -98,18 +100,17 @@ int fp_eq(float reference, float actual, float relErr) int main() { // golden memcpy(golden_array, b_array, OUTPUT_LEN * sizeof(float)); - sgemm_golden(MLEN, NLEN, KLEN, a_array, KLEN, b_array, NLEN, golden_array, NLEN); + sgemm_golden(); // vector memcpy(c_array, b_array, OUTPUT_LEN * sizeof(float)); sgemm_vec(MLEN, NLEN, KLEN, a_array, KLEN, b_array, NLEN, c_array, NLEN); int pass = 1; for (int i = 0; i < OUTPUT_LEN; i++) { - if (!fp_eq(golden_array[i], c_array[i], 1e-6)) { - printf("failed, %f=!%f\n", golden_array[i], c_array[i]); + if (!fp_eq(golden_array[i], c_array[i], 1e-5)) { + printf("index %d failed, %f=!%f\n", i, golden_array[i], c_array[i]); pass = 0; } - printf("%f,%f\n",golden_array[i], c_array[i]); } if (pass) printf("passed\n");