experimental/offload: Fix CI

pulp-platform · Jan 3, 2024 · commit c6e41a4 (1 parent: 37c9f4f)

Showing 7 changed files with 54 additions and 42 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -73,7 +73,7 @@ occamy-full-vsim:
 # Questa
 occamy-mcast-vsim:
   variables:
-    RISCV_CFLAGS: "-DUSE_MULTICAST -DOFFLOAD_AXPY"
+    RISCV_CFLAGS: "-DUSE_MULTICAST -DOFFLOAD_AXPY -DN_CLUSTERS_TO_USE=1"
     DATA_CFG: "$(PWD)/sw/device/apps/blas/gemm/params.hjson"
   script:
     - cd target/sim

diff --git a/target/sim/sw/device/apps/experimental_offload/src/gemm_job.h b/target/sim/sw/device/apps/experimental_offload/src/gemm_job.h
@@ -5,7 +5,8 @@
 #define XSSR
 #include "gemm.h"
 
-void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B, double* C) {
+void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B,
+            double* C) {
     const uint32_t compute_num = snrt_cluster_compute_core_num();
     const uint32_t compute_id = snrt_cluster_core_idx();
 
@@ -28,15 +29,15 @@ void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B, double* C)
     const uint32_t ssr0_b[3] = {K, N, m};
     const uint32_t ssr0_i[3] = {sizeof(double), 0, sizeof(double) * stride_a};
 
-    snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_b[2],
-                        ssr0_i[0], ssr0_i[1], ssr0_i[2]);
+    snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_b[2], ssr0_i[0],
+                     ssr0_i[1], ssr0_i[2]);
 
     // Second matrix is stored in transposed format
     const uint32_t ssr1_b[3] = {K, N, m};
     const uint32_t ssr1_i[3] = {8, 8 * stride_b, 0};
 
-    snrt_ssr_loop_3d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
-                        ssr1_i[0], ssr1_i[1], ssr1_i[2]);
+    snrt_ssr_loop_3d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_i[0],
+                     ssr1_i[1], ssr1_i[2]);
 
     // SSR start address need to be configured each time
     snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, a);
@@ -50,7 +51,8 @@ void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B, double* C)
             asm volatile(
                 "frep.o %[n_frep], 1, 0, 0 \n"
                 "fmadd.d %[accum], ft0, ft1, %[accum] \n"
-                : [ accum ] "+f"(accum) : [ n_frep ] "r"(K - 1)
+                : [ accum ] "+f"(accum)
+                : [ n_frep ] "r"(K - 1)
                 : "ft0", "ft1", "ft2");
 
             // Store results back

diff --git a/target/sim/sw/device/apps/experimental_offload/src/montecarlo_job.h b/target/sim/sw/device/apps/experimental_offload/src/montecarlo_job.h
@@ -9,8 +9,10 @@ __thread uint32_t seed0, seed1, Ap, Cp;
 
 inline void mc_init() {
     // Double the sequences as each core produces two random numbers
-    unsigned int num_sequences = 2 * snrt_cluster_compute_core_num() * N_CLUSTERS_TO_USE;
-    init_2d_lcg_params(num_sequences, 0, LCG_A, LCG_C, &seed0, &seed1, &Ap, &Cp);
+    unsigned int num_sequences =
+        2 * snrt_cluster_compute_core_num() * N_CLUSTERS_TO_USE;
+    init_2d_lcg_params(num_sequences, 0, LCG_A, LCG_C, &seed0, &seed1, &Ap,
+                       &Cp);
 }
 
 void mc_job_dm_core(job_t* job) {
@@ -20,7 +22,7 @@ void mc_job_dm_core(job_t* job) {
     mc_job_t* mc_job = (mc_job_t*)l1_job_ptr;
 #endif
 
-    snrt_mcycle(); // Retrieve job information (get job arguments)
+    snrt_mcycle();  // Retrieve job information (get job arguments)
 
 #ifndef MULTICAST
     // Copy job info (cluster 0 already has the data, no need to copy)
@@ -31,7 +33,7 @@ void mc_job_dm_core(job_t* job) {
     }
 #endif
 
-    snrt_mcycle(); // Retrieve job operands
+    snrt_mcycle();  // Retrieve job operands
 
 #ifndef MULTICAST
     // Synchronize with compute cores before updating the l1 alloc pointer
@@ -44,13 +46,13 @@ void mc_job_dm_core(job_t* job) {
     void* next = (void*)((uint32_t)(mc_job) + sizeof(mc_job_t));
     snrt_l1_update_next(next);
 
-    snrt_mcycle(); // Barrier
+    snrt_mcycle();  // Barrier
 
-    // Synchronize with compute cores to make sure the  
+    // Synchronize with compute cores to make sure the
     // L1 pointer is up to date before they can start computing
     snrt_cluster_hw_barrier();
 
-    snrt_mcycle(); // Job execution
+    snrt_mcycle();  // Job execution
 
     // Intra-cluster barrier
     snrt_cluster_hw_barrier();
@@ -64,22 +66,24 @@ void mc_job_compute_core(job_t* job) {
 
     // Get args
     uint32_t n_samples = mc_job->args.n_samples;
-    double* result_ptr = (double *)(mc_job->args.result_ptr);
+    double* result_ptr = (double*)(mc_job->args.result_ptr);
 
     snrt_mcycle();
 
-    // Synchronize with DM core to make sure the  
+    // Synchronize with DM core to make sure the
     // L1 pointer is up to date before they can start computing
     snrt_cluster_hw_barrier();
 
     snrt_mcycle();
 
     // Get addresses of partial sum arrays
-    uint32_t* reduction_array = (uint32_t*) snrt_l1_next();
-    uint32_t* global_reduction_array = reduction_array + snrt_cluster_compute_core_num();
+    uint32_t* reduction_array = (uint32_t*)snrt_l1_next();
+    uint32_t* global_reduction_array =
+        reduction_array + snrt_cluster_compute_core_num();
 
     // Run core-local kernel
-    reduction_array[snrt_cluster_core_idx()] = calculate_partial_sum(seed0, seed1, Ap, Cp, n_samples);
+    reduction_array[snrt_cluster_core_idx()] =
+        calculate_partial_sum(seed0, seed1, Ap, Cp, n_samples);
 
     snrt_mcycle();
 
@@ -97,26 +101,28 @@ void mc_job_compute_core(job_t* job) {
             sum += reduction_array[i];
         }
 
-        snrt_mcycle(); // Exchange partial sums
+        snrt_mcycle();  // Exchange partial sums
 
         if (snrt_cluster_idx() != 0) {
             // Calculate address of cluster 0's reduction array
-            global_reduction_array = (uint32_t *)(((uint32_t)global_reduction_array) - snrt_cluster_idx() * cluster_offset);
+            global_reduction_array =
+                (uint32_t*)(((uint32_t)global_reduction_array) -
+                            snrt_cluster_idx() * cluster_offset);
 
             // Store partial sum to cluster 0's reduction array
             global_reduction_array[snrt_cluster_idx()] = sum;
 
             // Inter-cluster barrier
             uint32_t barrier_ptr = (uint32_t)(&ct_barrier_cnt);
             barrier_ptr -= cluster_offset * snrt_cluster_idx();
-            uint32_t cnt = __atomic_add_fetch((volatile uint32_t*)barrier_ptr, 1, __ATOMIC_RELAXED);
+            uint32_t cnt = __atomic_add_fetch((volatile uint32_t*)barrier_ptr,
+                                              1, __ATOMIC_RELAXED);
 
             // Send interrupt if last to arrive on barrier
             if (cnt == (N_CLUSTERS_TO_USE - 1)) {
-
                 // Reset inter-cluster barrier counter
                 *((volatile uint32_t*)barrier_ptr) = 0;
-            
+
                 // Send interrupt to cluster 0
                 *(cluster_clint_set_ptr(0)) = 1;
             }
@@ -128,20 +134,20 @@ void mc_job_compute_core(job_t* job) {
                 snrt_int_clr_mcip_unsafe();
             }
 
-            snrt_mcycle(); // Inter-cluster reduction
+            snrt_mcycle();  // Inter-cluster reduction
 
             for (int i = 1; i < N_CLUSTERS_TO_USE; i++) {
                 sum += global_reduction_array[i];
             }
 
             // Calculate PI
-            *result_ptr = estimate_pi(sum, n_samples * N_CLUSTERS_TO_USE * snrt_cluster_compute_core_num());
+            *result_ptr = estimate_pi(sum, n_samples * N_CLUSTERS_TO_USE *
+                                               snrt_cluster_compute_core_num());
             snrt_fpu_fence();
         }
     }
 
     snrt_mcycle();
 
-    if (snrt_global_core_idx() == 0)
-        set_host_sw_interrupt();
+    if (snrt_global_core_idx() == 0) set_host_sw_interrupt();
 }
diff --git a/target/sim/sw/device/runtime/src/occamy_device.h b/target/sim/sw/device/runtime/src/occamy_device.h
@@ -67,5 +67,6 @@ inline void return_to_cva6(sync_t sync) {
 }
 
 inline void return_to_cva6_accelerated(uint8_t offload_id) {
-    *((volatile uint32_t*)(CLINT_BASE_ADDR + CLINT_RETURN_TO_CVA6_REG_OFFSET)) = 1 << offload_id;
+    *((volatile uint32_t*)(CLINT_BASE_ADDR + CLINT_RETURN_TO_CVA6_REG_OFFSET)) =
+        1 << offload_id;
 }
diff --git a/target/sim/sw/host/apps/experimental_offload/plot.py b/target/sim/sw/host/apps/experimental_offload/plot.py
@@ -5,6 +5,8 @@
 #
 # Luca Colagrande <colluca@iis.ee.ethz.ch>
 
+# flake8: noqa
+
 import argparse
 import pandas as pd
 import numpy as np

diff --git a/target/sim/sw/host/apps/experimental_offload/run.py b/target/sim/sw/host/apps/experimental_offload/run.py
@@ -5,7 +5,6 @@
 #
 # Luca Colagrande <colluca@iis.ee.ethz.ch>
 
-import argparse
 import os
 from pathlib import Path
 import signal
@@ -15,7 +14,7 @@
 from termcolor import cprint, colored
 
 sys.path.append(str(Path(__file__).parent / '../../../../../../deps/snitch_cluster/util/sim/'))
-import sim_utils
+import sim_utils  # noqa: E402
 from Simulator import QuestaSimulator  # noqa: E402
 
 FILE_DIR = Path(__file__).parent.resolve()
@@ -80,7 +79,6 @@ def build_hw(tests, dry_run=False):
 
 
 def post_process_traces(test, dry_run=False):
-    app = test['app']
     n_clusters_to_use = test['n_clusters_to_use']
     logdir = test['run_dir'] / 'logs'
     device_elf = test['device_elf']
@@ -101,7 +99,7 @@ def post_process_traces(test, dry_run=False):
 
 # Get tests from a test list file
 def get_tests(testlist, run_dir, hw_cfg):
-    
+
     # Get tests from test list file
     testlist_path = Path(testlist).absolute()
     with open(testlist_path, 'r') as f:
@@ -129,12 +127,12 @@ def get_tests(testlist, run_dir, hw_cfg):
         sim_bin = TARGET_DIR / BIN_DIR / full_hw_cfg / 'occamy_top.vsim'
         cflags = f'-DN_CLUSTERS_TO_USE={n_clusters_to_use}'
         if multicast:
-            cflags += f' -DMULTICAST'
+            cflags += ' -DMULTICAST'
         if app == 'axpy':
-            cflags += f' -DOFFLOAD_AXPY'
+            cflags += ' -DOFFLOAD_AXPY'
             layout = 'layout.csv'
         elif app == 'gemm':
-            cflags += f' -DOFFLOAD_GEMM'
+            cflags += ' -DOFFLOAD_GEMM'
             layout = 'layout.csv'
         elif app == 'mc':
             cflags += f' -DOFFLOAD_MONTECARLO -DMC_LENGTH={length}'

diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.c b/target/sim/sw/host/apps/experimental_offload/src/offload.c
@@ -3,9 +3,9 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "offload.h"
+#include <math.h>
 #include <stddef.h>
 #include "host.c"
-#include <math.h>
 
 #if defined(OFFLOAD_AXPY)
 #include "axpy/data/data.h"
@@ -31,7 +31,7 @@ usr_data_t usr_data __attribute__((section(".nc_spm")));
 double pi __attribute__((section(".wide_spm")));
 
 static inline void send_job_and_wakeup(job_t *job, uint64_t l1_job_ptr) {
-    *((volatile uint32_t*)(CLINT_BASE_ADDR + CLINT_OFFLOAD0_REG_OFFSET)) =
+    *((volatile uint32_t *)(CLINT_BASE_ADDR + CLINT_OFFLOAD0_REG_OFFSET)) =
         n_clusters_to_use;
 
     switch (job->id) {
@@ -138,9 +138,12 @@ int main() {
         WIDE_SPM_ADDR((uint64_t)y), WIDE_SPM_ADDR((uint64_t)z)};
     job_t axpy = {J_AXPY, 0, axpy_args};
 #elif defined(OFFLOAD_GEMM)
-    gemm_args_t gemm_args = {
-        M / n_clusters_to_use, N, K, WIDE_SPM_ADDR((uint64_t)a),
-        WIDE_SPM_ADDR((uint64_t)b), WIDE_SPM_ADDR((uint64_t)c)};
+    gemm_args_t gemm_args = {M / n_clusters_to_use,
+                             N,
+                             K,
+                             WIDE_SPM_ADDR((uint64_t)a),
+                             WIDE_SPM_ADDR((uint64_t)b),
+                             WIDE_SPM_ADDR((uint64_t)c)};
     job_args_t job_args;
     job_args.gemm = gemm_args;
     job_t gemm = {J_GEMM, 0, job_args};
@@ -216,7 +219,7 @@ int main() {
     sys_dma_blk_memcpy((uint64_t)c, WIDE_SPM_ADDR((uint64_t)c),
                        M * N * sizeof(double));
 #elif defined(OFFLOAD_MONTECARLO)
-    double pi_estimate = *((double*)mc_args.result_ptr);
+    double pi_estimate = *((double *)mc_args.result_ptr);
     double err = fabs(pi_estimate - 3.14);
     if (err > 0.5) return 1;
 #endif