Skip to content

Commit

Permalink
experimental/offload: Fix CI
Browse files Browse the repository at this point in the history
  • Loading branch information
colluca committed Jan 3, 2024
1 parent 37c9f4f commit c6e41a4
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ occamy-full-vsim:
# Questa
occamy-mcast-vsim:
variables:
RISCV_CFLAGS: "-DUSE_MULTICAST -DOFFLOAD_AXPY"
RISCV_CFLAGS: "-DUSE_MULTICAST -DOFFLOAD_AXPY -DN_CLUSTERS_TO_USE=1"
DATA_CFG: "$(PWD)/sw/device/apps/blas/gemm/params.hjson"
script:
- cd target/sim
Expand Down
14 changes: 8 additions & 6 deletions target/sim/sw/device/apps/experimental_offload/src/gemm_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
#define XSSR
#include "gemm.h"

void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B, double* C) {
void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B,
double* C) {
const uint32_t compute_num = snrt_cluster_compute_core_num();
const uint32_t compute_id = snrt_cluster_core_idx();

Expand All @@ -28,15 +29,15 @@ void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B, double* C)
const uint32_t ssr0_b[3] = {K, N, m};
const uint32_t ssr0_i[3] = {sizeof(double), 0, sizeof(double) * stride_a};

snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_b[2],
ssr0_i[0], ssr0_i[1], ssr0_i[2]);
snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_b[2], ssr0_i[0],
ssr0_i[1], ssr0_i[2]);

// Second matrix is stored in transposed format
const uint32_t ssr1_b[3] = {K, N, m};
const uint32_t ssr1_i[3] = {8, 8 * stride_b, 0};

snrt_ssr_loop_3d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
ssr1_i[0], ssr1_i[1], ssr1_i[2]);
snrt_ssr_loop_3d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_i[0],
ssr1_i[1], ssr1_i[2]);

// SSR start address needs to be configured each time
snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, a);
Expand All @@ -50,7 +51,8 @@ void matmul(uint32_t M, uint32_t N, uint32_t K, double* A, double* B, double* C)
asm volatile(
"frep.o %[n_frep], 1, 0, 0 \n"
"fmadd.d %[accum], ft0, ft1, %[accum] \n"
: [ accum ] "+f"(accum) : [ n_frep ] "r"(K - 1)
: [ accum ] "+f"(accum)
: [ n_frep ] "r"(K - 1)
: "ft0", "ft1", "ft2");

// Store results back
Expand Down
48 changes: 27 additions & 21 deletions target/sim/sw/device/apps/experimental_offload/src/montecarlo_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ __thread uint32_t seed0, seed1, Ap, Cp;

inline void mc_init() {
// Double the sequences as each core produces two random numbers
unsigned int num_sequences = 2 * snrt_cluster_compute_core_num() * N_CLUSTERS_TO_USE;
init_2d_lcg_params(num_sequences, 0, LCG_A, LCG_C, &seed0, &seed1, &Ap, &Cp);
unsigned int num_sequences =
2 * snrt_cluster_compute_core_num() * N_CLUSTERS_TO_USE;
init_2d_lcg_params(num_sequences, 0, LCG_A, LCG_C, &seed0, &seed1, &Ap,
&Cp);
}

void mc_job_dm_core(job_t* job) {
Expand All @@ -20,7 +22,7 @@ void mc_job_dm_core(job_t* job) {
mc_job_t* mc_job = (mc_job_t*)l1_job_ptr;
#endif

snrt_mcycle(); // Retrieve job information (get job arguments)
snrt_mcycle(); // Retrieve job information (get job arguments)

#ifndef MULTICAST
// Copy job info (cluster 0 already has the data, no need to copy)
Expand All @@ -31,7 +33,7 @@ void mc_job_dm_core(job_t* job) {
}
#endif

snrt_mcycle(); // Retrieve job operands
snrt_mcycle(); // Retrieve job operands

#ifndef MULTICAST
// Synchronize with compute cores before updating the l1 alloc pointer
Expand All @@ -44,13 +46,13 @@ void mc_job_dm_core(job_t* job) {
void* next = (void*)((uint32_t)(mc_job) + sizeof(mc_job_t));
snrt_l1_update_next(next);

snrt_mcycle(); // Barrier
snrt_mcycle(); // Barrier

// Synchronize with compute cores to make sure the
// Synchronize with compute cores to make sure the
// L1 pointer is up to date before they can start computing
snrt_cluster_hw_barrier();

snrt_mcycle(); // Job execution
snrt_mcycle(); // Job execution

// Intra-cluster barrier
snrt_cluster_hw_barrier();
Expand All @@ -64,22 +66,24 @@ void mc_job_compute_core(job_t* job) {

// Get args
uint32_t n_samples = mc_job->args.n_samples;
double* result_ptr = (double *)(mc_job->args.result_ptr);
double* result_ptr = (double*)(mc_job->args.result_ptr);

snrt_mcycle();

// Synchronize with DM core to make sure the
// Synchronize with DM core to make sure the
// L1 pointer is up to date before they can start computing
snrt_cluster_hw_barrier();

snrt_mcycle();

// Get addresses of partial sum arrays
uint32_t* reduction_array = (uint32_t*) snrt_l1_next();
uint32_t* global_reduction_array = reduction_array + snrt_cluster_compute_core_num();
uint32_t* reduction_array = (uint32_t*)snrt_l1_next();
uint32_t* global_reduction_array =
reduction_array + snrt_cluster_compute_core_num();

// Run core-local kernel
reduction_array[snrt_cluster_core_idx()] = calculate_partial_sum(seed0, seed1, Ap, Cp, n_samples);
reduction_array[snrt_cluster_core_idx()] =
calculate_partial_sum(seed0, seed1, Ap, Cp, n_samples);

snrt_mcycle();

Expand All @@ -97,26 +101,28 @@ void mc_job_compute_core(job_t* job) {
sum += reduction_array[i];
}

snrt_mcycle(); // Exchange partial sums
snrt_mcycle(); // Exchange partial sums

if (snrt_cluster_idx() != 0) {
// Calculate address of cluster 0's reduction array
global_reduction_array = (uint32_t *)(((uint32_t)global_reduction_array) - snrt_cluster_idx() * cluster_offset);
global_reduction_array =
(uint32_t*)(((uint32_t)global_reduction_array) -
snrt_cluster_idx() * cluster_offset);

// Store partial sum to cluster 0's reduction array
global_reduction_array[snrt_cluster_idx()] = sum;

// Inter-cluster barrier
uint32_t barrier_ptr = (uint32_t)(&ct_barrier_cnt);
barrier_ptr -= cluster_offset * snrt_cluster_idx();
uint32_t cnt = __atomic_add_fetch((volatile uint32_t*)barrier_ptr, 1, __ATOMIC_RELAXED);
uint32_t cnt = __atomic_add_fetch((volatile uint32_t*)barrier_ptr,
1, __ATOMIC_RELAXED);

// Send interrupt if last to arrive on barrier
if (cnt == (N_CLUSTERS_TO_USE - 1)) {

// Reset inter-cluster barrier counter
*((volatile uint32_t*)barrier_ptr) = 0;

// Send interrupt to cluster 0
*(cluster_clint_set_ptr(0)) = 1;
}
Expand All @@ -128,20 +134,20 @@ void mc_job_compute_core(job_t* job) {
snrt_int_clr_mcip_unsafe();
}

snrt_mcycle(); // Inter-cluster reduction
snrt_mcycle(); // Inter-cluster reduction

for (int i = 1; i < N_CLUSTERS_TO_USE; i++) {
sum += global_reduction_array[i];
}

// Calculate PI
*result_ptr = estimate_pi(sum, n_samples * N_CLUSTERS_TO_USE * snrt_cluster_compute_core_num());
*result_ptr = estimate_pi(sum, n_samples * N_CLUSTERS_TO_USE *
snrt_cluster_compute_core_num());
snrt_fpu_fence();
}
}

snrt_mcycle();

if (snrt_global_core_idx() == 0)
set_host_sw_interrupt();
if (snrt_global_core_idx() == 0) set_host_sw_interrupt();
}
3 changes: 2 additions & 1 deletion target/sim/sw/device/runtime/src/occamy_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,6 @@ inline void return_to_cva6(sync_t sync) {
}

inline void return_to_cva6_accelerated(uint8_t offload_id) {
*((volatile uint32_t*)(CLINT_BASE_ADDR + CLINT_RETURN_TO_CVA6_REG_OFFSET)) = 1 << offload_id;
*((volatile uint32_t*)(CLINT_BASE_ADDR + CLINT_RETURN_TO_CVA6_REG_OFFSET)) =
1 << offload_id;
}
2 changes: 2 additions & 0 deletions target/sim/sw/host/apps/experimental_offload/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#
# Luca Colagrande <colluca@iis.ee.ethz.ch>

# flake8: noqa

import argparse
import pandas as pd
import numpy as np
Expand Down
12 changes: 5 additions & 7 deletions target/sim/sw/host/apps/experimental_offload/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#
# Luca Colagrande <colluca@iis.ee.ethz.ch>

import argparse
import os
from pathlib import Path
import signal
Expand All @@ -15,7 +14,7 @@
from termcolor import cprint, colored

sys.path.append(str(Path(__file__).parent / '../../../../../../deps/snitch_cluster/util/sim/'))
import sim_utils
import sim_utils # noqa: E402
from Simulator import QuestaSimulator # noqa: E402

FILE_DIR = Path(__file__).parent.resolve()
Expand Down Expand Up @@ -80,7 +79,6 @@ def build_hw(tests, dry_run=False):


def post_process_traces(test, dry_run=False):
app = test['app']
n_clusters_to_use = test['n_clusters_to_use']
logdir = test['run_dir'] / 'logs'
device_elf = test['device_elf']
Expand All @@ -101,7 +99,7 @@ def post_process_traces(test, dry_run=False):

# Get tests from a test list file
def get_tests(testlist, run_dir, hw_cfg):

# Get tests from test list file
testlist_path = Path(testlist).absolute()
with open(testlist_path, 'r') as f:
Expand Down Expand Up @@ -129,12 +127,12 @@ def get_tests(testlist, run_dir, hw_cfg):
sim_bin = TARGET_DIR / BIN_DIR / full_hw_cfg / 'occamy_top.vsim'
cflags = f'-DN_CLUSTERS_TO_USE={n_clusters_to_use}'
if multicast:
cflags += f' -DMULTICAST'
cflags += ' -DMULTICAST'
if app == 'axpy':
cflags += f' -DOFFLOAD_AXPY'
cflags += ' -DOFFLOAD_AXPY'
layout = 'layout.csv'
elif app == 'gemm':
cflags += f' -DOFFLOAD_GEMM'
cflags += ' -DOFFLOAD_GEMM'
layout = 'layout.csv'
elif app == 'mc':
cflags += f' -DOFFLOAD_MONTECARLO -DMC_LENGTH={length}'
Expand Down
15 changes: 9 additions & 6 deletions target/sim/sw/host/apps/experimental_offload/src/offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
// SPDX-License-Identifier: Apache-2.0

#include "offload.h"
#include <math.h>
#include <stddef.h>
#include "host.c"
#include <math.h>

#if defined(OFFLOAD_AXPY)
#include "axpy/data/data.h"
Expand All @@ -31,7 +31,7 @@ usr_data_t usr_data __attribute__((section(".nc_spm")));
double pi __attribute__((section(".wide_spm")));

static inline void send_job_and_wakeup(job_t *job, uint64_t l1_job_ptr) {
*((volatile uint32_t*)(CLINT_BASE_ADDR + CLINT_OFFLOAD0_REG_OFFSET)) =
*((volatile uint32_t *)(CLINT_BASE_ADDR + CLINT_OFFLOAD0_REG_OFFSET)) =
n_clusters_to_use;

switch (job->id) {
Expand Down Expand Up @@ -138,9 +138,12 @@ int main() {
WIDE_SPM_ADDR((uint64_t)y), WIDE_SPM_ADDR((uint64_t)z)};
job_t axpy = {J_AXPY, 0, axpy_args};
#elif defined(OFFLOAD_GEMM)
gemm_args_t gemm_args = {
M / n_clusters_to_use, N, K, WIDE_SPM_ADDR((uint64_t)a),
WIDE_SPM_ADDR((uint64_t)b), WIDE_SPM_ADDR((uint64_t)c)};
gemm_args_t gemm_args = {M / n_clusters_to_use,
N,
K,
WIDE_SPM_ADDR((uint64_t)a),
WIDE_SPM_ADDR((uint64_t)b),
WIDE_SPM_ADDR((uint64_t)c)};
job_args_t job_args;
job_args.gemm = gemm_args;
job_t gemm = {J_GEMM, 0, job_args};
Expand Down Expand Up @@ -216,7 +219,7 @@ int main() {
sys_dma_blk_memcpy((uint64_t)c, WIDE_SPM_ADDR((uint64_t)c),
M * N * sizeof(double));
#elif defined(OFFLOAD_MONTECARLO)
double pi_estimate = *((double*)mc_args.result_ptr);
double pi_estimate = *((double *)mc_args.result_ptr);
double err = fabs(pi_estimate - 3.14);
if (err > 0.5) return 1;
#endif
Expand Down

0 comments on commit c6e41a4

Please sign in to comment.