diff --git a/target/sim/Makefile b/target/sim/Makefile index 25075f7f..2e15a2d2 100644 --- a/target/sim/Makefile +++ b/target/sim/Makefile @@ -575,7 +575,7 @@ clean-work: rm -rf work clean-bender: - rm -rf Bender.lock .bender/ work/ + rm -rf $(ROOT)/Bender.lock $(ROOT)/.bender/ $(ROOT)/deps clean-logs: rm -rf $(LOGS_DIR)/ diff --git a/target/sim/sw/device/Makefile b/target/sim/sw/device/Makefile index 4aba3474..89f4f5a3 100644 --- a/target/sim/sw/device/Makefile +++ b/target/sim/sw/device/Makefile @@ -5,7 +5,7 @@ # Luca Colagrande # Add user applications to APPS variable -APPS = offload +APPS = offload sndnn/gemm axpy TARGET ?= all @@ -22,5 +22,8 @@ runtime: $(MAKE) -C $@ $(TARGET) # Explicit dependency of apps on runtime -$(APP_SUBDIRS): runtime +$(APP_SUBDIRS): libraries/snDNN runtime + $(MAKE) -C $@ $(TARGET) + +libraries/snDNN: runtime $(MAKE) -C $@ $(TARGET) diff --git a/target/sim/sw/device/apps/Makefile b/target/sim/sw/device/apps/Makefile new file mode 100644 index 00000000..115d1ccb --- /dev/null +++ b/target/sim/sw/device/apps/Makefile @@ -0,0 +1,28 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +SUBDIRS = offload +# SUBDIRS += gemm +SUBDIRS += axpy +# SUBDIRS += nop +# SUBDIRS += blas/axpy +# SUBDIRS += blas/gemm +# SUBDIRS += sndnn/batchnorm +# # SUBDIRS += sndnn/conv2d # fails with exit code 32 +# SUBDIRS += sndnn/fusedconv +# SUBDIRS += sndnn/gelu +SUBDIRS += sndnn/gemm +# # SUBDIRS += sndnn/layernorm # throws illegal instruction in simulation +# SUBDIRS += sndnn/linear +# SUBDIRS += sndnn/maxpool +# # SUBDIRS += sndnn/softmax + +.PHONY: all clean $(SUBDIRS) + +all: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(TARGET) diff --git a/target/sim/sw/device/apps/common.mk b/target/sim/sw/device/apps/common.mk index 4c4e5ab0..f379ea05 100644 --- a/target/sim/sw/device/apps/common.mk +++ b/target/sim/sw/device/apps/common.mk @@ -4,18 +4,48 @@ # # Luca Colagrande -include ../../toolchain.mk +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +include $(MK_DIR)/../toolchain.mk ################### # Build variables # ################### +# Fixed paths in repository tree +ROOT = $(abspath $(MK_DIR)/../../../../..) 
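+# For illustration: since this file lives at target/sim/sw/device/apps/common.mk,
+# $(lastword $(MAKEFILE_LIST)) points at it even when it is included from an
+# app directory, so MK_DIR resolves to .../target/sim/sw/device/apps/ and ROOT
+# to the repository top, independently of where make was invoked.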
# Directories -BUILDDIR = $(abspath build) -APPSDIR = $(abspath ../) -RUNTIME_DIR = $(abspath ../../runtime) +APPSDIR = $(abspath $(MK_DIR)/) +# RUNTIME_DIR = $(abspath ../../runtime) +RUNTIME_DIR = $(abspath $(MK_DIR)/../runtime) SNRT_DIR = $(shell bender path snitch_cluster)/sw/snRuntime -SW_DIR = $(abspath ../../../) +# SW_DIR = $(abspath ../../../) +SW_DIR = $(abspath $(MK_DIR)/../../) + +################# +# SNDNN_LIBRARY # +################# +ifdef USE_SNDNN_LIBRARY +SNDNN_DIR := $(shell bender path snitch_cluster)/sw/snDNN +SNDNN_LIB_DIR := $(abspath $(MK_DIR)/../libraries/snDNN) +SNDNN_LIB_NAME = snDNN + +# Dependencies +INCDIRS += $(SNDNN_LIB_DIR)/src +INCDIRS += $(SNDNN_DIR)/src +INCDIRS += $(SNDNN_DIR)/include +# Linker script +# RISCV_LDFLAGS += -L$(abspath $(SNDNN_LIB_DIR)) +# Link snRuntime library +RISCV_LDFLAGS += -L$(abspath $(SNDNN_LIB_DIR)/build/) +RISCV_LDFLAGS += -l$(SNDNN_LIB_NAME) +BUILDDIR = $(abspath $(MK_DIR)/sndnn/$(APP)/build) +SNDNN_LIB = $(realpath $(SNDNN_LIB_DIR)/build/lib$(SNDNN_LIB_NAME).a) +LD_SRCS += $(SNDNN_LIB) +else +BUILDDIR = $(abspath $(MK_DIR)/$(APP)/build) +endif # Dependencies INCDIRS += $(RUNTIME_DIR)/src @@ -83,7 +113,7 @@ $(DEP): $(SRCS) | $(BUILDDIR) $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< > $@ $(ELF): $(DEP) $(LD_SRCS) | $(BUILDDIR) - $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ + $(RISCV_CC) $(RISCV_CFLAGS) $(SRCS) $(RISCV_LDFLAGS) -o $@ $(BIN): $(ELF) | $(BUILDDIR) $(RISCV_OBJCOPY) $(OBJCOPY_FLAGS) $< $@ diff --git a/target/sim/sw/device/apps/sndnn/Makefile b/target/sim/sw/device/apps/sndnn/Makefile new file mode 100644 index 00000000..337cf884 --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/Makefile @@ -0,0 +1,34 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Gianna Paulin + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +DATA_DIR := $(realpath $(MK_DIR)/data) +SRC_DIR := $(realpath $(MK_DIR)/$(APP_NAME)/src) +SNDNN_DIR := $(shell bender path snitch_cluster)/sw/snDNN/) +SNDNN_LIB_DIR := $(realpath $(MK_DIR)/../../libraries/snDNN/) +# SNDNN_SRC_DIR := $(shell bender path snitch_cluster)/sw/snDNN/src) + +INCLUDE_DIR = $(realpath $(SNDNN_DIR)/include) +INCLUDE_DIR += $(realpath $(SNDNN_DIR)/src) +INCLUDE_DIR += $(realpath $(SNDNN_LIB_DIR)/src) + +DATA_CFG ?= $(DATA_DIR)/$(APP_NAME)_params.hjson + +APP ?= $(APP_NAME) +SRCS += $(realpath $(SRC_DIR)/net_$(APP_NAME).c) +# SRCS += $(realpath $(SNDNN_LIB_DIR)/src/sndnn.c) +INCDIRS += $(DATA_DIR) $(SRC_DIR) $(INCLUDE_DIR) + +$(DATA_DIR)/data_$(APP_NAME).h: $(MK_DIR)/datagen.py $(DATA_CFG) + $< -c $(DATA_CFG) > $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_DIR)/data_$(APP_NAME).h + +clean: clean-data diff --git a/target/sim/sw/device/apps/sndnn/data/gemm_params.hjson b/target/sim/sw/device/apps/sndnn/data/gemm_params.hjson new file mode 100644 index 00000000..e3b54c27 --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/data/gemm_params.hjson @@ -0,0 +1,17 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMM + +{ + kernel: "GEMM" + M: 16, + N: 16, + K: 16, + alpha: 0, + transpose_A: false, + transpose_B: true, + prec: 32, + expand: 0 +} diff --git a/target/sim/sw/device/apps/sndnn/datagen.py b/target/sim/sw/device/apps/sndnn/datagen.py new file mode 100755 index 00000000..65fc63f3 --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/datagen.py @@ -0,0 +1,874 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Tim Fischer +# Author: Viviane Potocnik + +import numpy as np +import torch +import torch.nn as nn +import argparse +import pathlib +import hjson + + +np.random.seed(42) +torch.manual_seed(42) + +global verbose + + +def array_to_cstr(a, fmt=float): + out = '{' + if fmt == float: + if isinstance(a, np.ndarray): + a = a.flat + if isinstance(a, torch.Tensor): + a = a.numpy().flat + for el in a: + out += '{}, '.format(el) + else: + for sign, exp, mant in zip(a['sign'].numpy().flat, + a['exponent'].numpy().flat, + a['mantissa'].numpy().flat): + value = sign * 2**7 + exp * 2**2 + mant + out += "0x{:02x}, ".format(value) + out = out[:-2] + '}' + return out + + +def emit_header_file(layer_type: str, **kwargs): + + file_path = pathlib.Path(__file__).parent / 'data' + emit_str = "// Copyright 2022 ETH Zurich and University of Bologna.\n" + \ + "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + \ + "// SPDX-License-Identifier: Apache-2.0\n\n" + + if layer_type == 'Conv2d': + file = file_path / 'data_conv2d.h' + emit_str += emit_conv2d_layer(**kwargs) + elif layer_type == 'GEMM': + file = file_path / 'data_gemm.h' + emit_str += emit_GEMM_layer(**kwargs) + elif layer_type == 'BatchNorm': + file = file_path / 'data_batchnorm.h' + emit_str += emit_batchnorm_layer(**kwargs) + elif layer_type == 'MaxPool': + file = file_path / 'data_maxpool.h' + emit_str += emit_maxpool_layer(**kwargs) + elif layer_type == 'FusedConv': + file = file_path / 'data_fusedconv.h' + emit_str += emit_fusedconv(**kwargs) + elif layer_type == 'Linear': + file = file_path / 'data_linear.h' + emit_str += emit_linear_layer(**kwargs) + elif layer_type == 'GELU': + file = file_path / 'data_gelu.h' + emit_str += emit_gelu_layer(**kwargs) + elif layer_type == 'SoftMax': + file = file_path / 'data_softmax.h' + emit_str += emit_softmax_layer(**kwargs) + elif layer_type == 'LayerNorm': + file = file_path / 'data_layernorm.h' + emit_str += emit_layernorm_layer(**kwargs) + + with file.open('w') as f: + f.write(emit_str) + + +def emit_layernorm_layer(name='layernorm', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + + batch_size, seq_len, embeddings = ifmap.shape + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + checksum = torch.sum(ifmap, dim=-1) + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'layernorm_layer_t {name}_l = {{\n' + layer_str += f'\t.BATCH_SIZE = {batch_size},\n' # batch_size + layer_str += f'\t.SEQ_LEN = {seq_len},\n' # seq_len + layer_str += f'\t.EMBEDDINGS = {embeddings},\n' # embeddings + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += '};\n\n\n' + + layer_str += f'static {dtype} {name}_result[{batch_size}][{seq_len}]' + layer_str += f'[{embeddings}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} 
{name}_ifmap_dram[{batch_size}][{seq_len}][{embeddings}] = ' \ + + array_to_cstr(ifmap) + ';\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram[{batch_size}][{seq_len}][{embeddings}] = ' \ + + array_to_cstr(ofmap) + ';\n\n' + layer_str += f'static {dtype} {name}_checksum[{batch_size}][{seq_len}] = ' \ + + array_to_cstr(checksum) + ';\n\n' + + return layer_str + + +def emit_softmax_layer(name='softmax', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + reduce_dim = kwargs['reduce_dim'] + + batch_size, seq_len, input_samples = ifmap.shape + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'softmax_layer_t {name}_l = {{\n' + layer_str += f'\t.BATCH_SIZE = {batch_size},\n' # batch_size + layer_str += f'\t.SEQ_LEN = {seq_len},\n' # seq_len + layer_str += f'\t.INPUT_SAMPLES = {input_samples},\n' # input_samples + layer_str += f'\t.REDUCE_DIM = {reduce_dim},\n' # reduce_dim + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += '};\n\n\n' + + checksum = torch.sum(ofmap, dim=-1) + + layer_str += f'static {dtype} {name}_result[{batch_size}][{seq_len}]' + layer_str += f'[{input_samples}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_ifmap_dram[{batch_size}][{seq_len}][{input_samples}] = ' \ + + array_to_cstr(ifmap) + ';\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram[{batch_size}][{seq_len}][{input_samples}] = ' \ + + array_to_cstr(ofmap) + ';\n\n' + layer_str += f'static {dtype} {name}_checksum[{batch_size}][{seq_len}] = ' \ + + array_to_cstr(checksum) + ';\n\n' + + return layer_str + + +def emit_gelu_layer(name='gelu', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + + batch_size, seq_len, hidden_nodes = ifmap.shape + # print("batch_size: {}".format(batch_size)) + # print("seq_len: {},".format(seq_len)) + # print("hidden_nodes: {}".format(hidden_nodes)) + # for i in range(batch_size): + # for j in range(seq_len): + # for k in range(hidden_nodes): + # print("ifmap[{}][{}][{}] = {}".format(i, j, k, ifmap[i][j][k])) + # print("ofmap[{}][{}][{}] = {}".format(i, j, k, ofmap[i][j][k])) + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'gelu_layer_t {name}_l = {{\n' + layer_str += f'\t.BATCH_SIZE = {batch_size},\n' # batch_size + layer_str += f'\t.SEQ_LEN = {seq_len},\n' # seq_len + layer_str += f'\t.HIDDEN_NODES = {hidden_nodes},\n' # hidden_size + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += '};\n\n\n' + + layer_str += f'static {dtype} {name}_result[{batch_size}][{seq_len}]' + layer_str += f'[{hidden_nodes}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_ifmap_dram[{batch_size}][{seq_len}][{hidden_nodes}] = ' \ + + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram[{batch_size}][{seq_len}][{hidden_nodes}] = ' \ + + array_to_cstr(ofmap) + ';\n\n\n' + layer_str += f'static {dtype} {name}_checksum[{batch_size}][{seq_len}] = ' \ + + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + + return layer_str + + +def emit_linear_layer(name='linear', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + weights = kwargs['weights'] + bias = kwargs['bias'] + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } 
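+    # 'prec' (in bits) selects the C scalar type used for the emitted arrays,
+    # e.g. prec = 32 generates 'float' data in the header.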
+ + dtype = ctypes[str(kwargs['prec'])] + + ch, ci = ifmap.shape + _, co = ofmap.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'linear_layer_t {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' # out_features + layer_str += f'\t.CI = {ci},\n' # in_features + layer_str += f'\t.CH = {ch},\n' # height + layer_str += f'\t.CW = {ci}\n' # width + layer_str += '};\n\n\n' + + layer_str += f'static {dtype} {name}_result[{co*ch}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_checksum' + \ + f'[{co*ch}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static {dtype} {name}_ifmap_dram' + \ + f'[{ch}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static {dtype} {name}_weights_dram' + \ + f'[{co}][{ci}] = ' + array_to_cstr(weights) + ';\n\n\n' + layer_str += f'static {dtype} {name}_bias_dram[{co}] = ' + array_to_cstr(bias) + ';\n\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram' + \ + f'[{ch}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_conv2d_layer(name='conv2d', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + weights = kwargs['weights'] + + n, ih, iw, ci = ifmap.shape + _, oh, ow, co = ofmap.shape + _, fh, fw, _ = weights.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'conv_layer {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' + layer_str += f'\t.CI = {ci},\n' + layer_str += f'\t.IH = {ih},\n' + layer_str += f'\t.IW = {iw},\n' + layer_str += f'\t.OH = {oh},\n' + layer_str += f'\t.OW = {ow},\n' + layer_str += f'\t.FH = {fh},\n' + layer_str += f'\t.FW = {fw}\n' + layer_str += '};\n\n\n' + + layer_str += f'static double {name}_result' + \ + f'[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' + layer_str += f'static double {name}_checksum' + \ + f'[{oh}][{ow}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static double {name}_ifmap_dram' + \ + f'[{ih}][{iw}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static double {name}_weights_dram' + \ + f'[{co}][{ci}][{fh}][{fw}] = ' + array_to_cstr(weights) + ';\n\n\n' + layer_str += f'static double {name}_ofmap_dram' + \ + f'[{oh}][{ow}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_GEMM_layer(name='gemm', **kwargs): + mat_A = kwargs['A'] + mat_B = kwargs['B'] + mat_C = kwargs['C'] + result = kwargs['result'] + + m = kwargs['M'] + n = kwargs['N'] + k = kwargs['K'] + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'gemm_layer {name}_l = {{\n' + layer_str += f'\t.M = {m},\n' + layer_str += f'\t.N = {n},\n' + layer_str += f'\t.K = {k},\n' + layer_str += f'\t.TA = {int(kwargs["ta"])},\n' + layer_str += f'\t.TB = {int(kwargs["tb"])},\n' + layer_str += f'\t.ALPHA = {kwargs["alpha"]},\n' + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += f'\t.expand = {kwargs["expand"]}\n' + layer_str += '};\n\n\n' + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + if dtype != 'char': + layer_str += f'static {dtype} {name}_A_dram' + \ + f'[{m}][{k}] = ' + array_to_cstr(mat_A) + ';\n\n\n' + layer_str += f'static {dtype} {name}_B_dram' + \ + f'[{k}][{n}] = ' + array_to_cstr(mat_B) + ';\n\n\n' + layer_str += f'static {dtype} {name}_C_dram' + \ + f'[{m}][{n}] = ' + array_to_cstr(mat_C) + ';\n\n\n' + layer_str += f'static {dtype} {name}_result' + \ + f'[{m}][{n}] 
__attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_checksum' + \ + f'[{m}] = ' + array_to_cstr(torch.sum(result, dim=-1)) + ';\n\n\n' + else: + layer_str += f'static {dtype} {name}_A_dram [{m}][{k}] = ' + \ + array_to_cstr(kwargs['bits_A'], fmt='char') + ';\n\n\n' + layer_str += f'static {dtype} {name}_B_dram [{k}][{n}] = ' + \ + array_to_cstr(kwargs['bits_B'], fmt='char') + ';\n\n\n' + layer_str += f'static {dtype} {name}_C_dram [{m}][{n}] = ' + \ + array_to_cstr(kwargs['bits_C'], fmt='char') + ';\n\n\n' + + return layer_str + + +def emit_batchnorm_layer(name='batchnorm', **kwargs): + + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + beta = kwargs['beta'] + gamma = kwargs['gamma'] + + n, ih, iw, ci = ifmap.shape + _, oh, ow, co = ofmap.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'conv_layer {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' + layer_str += f'\t.CI = {ci},\n' + layer_str += f'\t.IH = {ih},\n' + layer_str += f'\t.IW = {iw},\n' + layer_str += f'\t.OH = {oh},\n' + layer_str += f'\t.OW = {ow},\n' + layer_str += '};\n\n\n' + + layer_str += f'static double {name}_result' + \ + f'[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' + layer_str += f'static double {name}_checksum' + \ + f'[{oh}][{ow}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static double {name}_ifmap_dram' + \ + f'[{ih}][{iw}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static double {name}_beta_dram' + \ + f'[{ci}] = ' + array_to_cstr(beta) + ';\n\n\n' + layer_str += f'static double {name}_gamma_dram' + \ + f'[{ci}] = ' + array_to_cstr(gamma) + ';\n\n\n' + layer_str += f'static double {name}_ofmap_dram' + \ + f'[{oh}][{ow}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_maxpool_layer(name='maxpool', **kwargs): + + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + k = kwargs['kernel_size'] + + n, ih, iw, ci = ifmap.shape + _, oh, ow, co = ofmap.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'conv_layer {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' + layer_str += f'\t.CI = {ci},\n' + layer_str += f'\t.IH = {ih},\n' + layer_str += f'\t.IW = {iw},\n' + layer_str += f'\t.OH = {oh},\n' + layer_str += f'\t.OW = {ow},\n' + layer_str += f'\t.FH = {k},\n' + layer_str += f'\t.FW = {k},\n' + layer_str += '};\n\n\n' + + layer_str += f'static double {name}_result' + \ + f'[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' + layer_str += f'static double {name}_checksum' + \ + f'[{oh}][{ow}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static double {name}_ifmap_dram' + \ + f'[{ih}][{iw}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static double {name}_ofmap_dram' + \ + f'[{oh}][{ow}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_fusedconv(name='fusedconv', **kwargs): + + ifmap = kwargs['ifmap'] + kernel = kwargs['kernel'] + bn_k = kwargs['bn_k'] + bn_l = kwargs['bn_l'] + ofmap = kwargs['ofmap'] + ofmap_before = kwargs['ofmap_before'] + ifmap_padded = kwargs['ifmap_padded'] + + padding = kwargs['padding'] + + if kwargs['depthwise']: + ih, iw, ci = ifmap.shape + oh, ow, co = ofmap.shape + fh, fw, co = kernel.shape + ci = co + ih_pad, iw_pad, _ = ifmap_padded.shape + elif kwargs['chw_layer']: + ci, ih, iw = ifmap.shape + oh, ow, co = ofmap.shape + co, ci, fh, fw = kernel.shape + _, ih_pad, iw_pad = ifmap_padded.shape + else: + ih, iw, ci = 
ifmap.shape + oh, ow, co = ofmap.shape + _, fh, fw, _ = kernel.shape + ih_pad, iw_pad, _ = ifmap_padded.shape + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + + layer_str = '#include \n' + layer_str += '#include "conv2d.h"\n\n' + layer_str += 'kernel_fp32 k = {\n' + layer_str += f'\t.ch_in = {ci},\n' + layer_str += f'\t.ch_out = {co},\n' + layer_str += f'\t.dim_in_x = {iw},\n' + layer_str += f'\t.dim_in_y = {ih},\n' + layer_str += f'\t.dim_kernel_x = {fw},\n' + layer_str += f'\t.dim_kernel_y = {fh},\n' + layer_str += f'\t.dim_out_x = {ow},\n' + layer_str += f'\t.dim_out_y = {oh},\n' + layer_str += f'\t.padding_y_top = {padding["padding_y_top"]},\n' + layer_str += f'\t.padding_y_bottom = {padding["padding_y_bottom"]},\n' + layer_str += f'\t.padding_x_left = {padding["padding_x_left"]},\n' + layer_str += f'\t.padding_x_right = {padding["padding_x_right"]},\n' + layer_str += f'\t.stride_x = {kwargs["stride"]["stride_x"]},\n' + layer_str += f'\t.stride_y = {kwargs["stride"]["stride_y"]},\n' + layer_str += f'\t.flag_relu = {kwargs["flags"]["flag_relu"]},\n' + layer_str += f'\t.flag_batch_norm = {kwargs["flags"]["flag_batch_norm"]},\n' + layer_str += f'\t.flag_y_accumulate_start = {kwargs["flags"]["flag_y_accumulate_start"]},\n' + layer_str += f'\t.flag_y_accumulate_end = {kwargs["flags"]["flag_y_accumulate_end"]},\n' + layer_str += '};\n\n' + layer_str += f'uint32_t dw = {kwargs["depthwise"]};\n' + layer_str += f'uint32_t chw_layer = {kwargs["chw_layer"]};\n' + + layer_str += f'static {dtype} {name}_pInBuffer_dram' + \ + f'[{ih_pad}][{iw_pad}][{ci}] = ' + array_to_cstr(ifmap_padded) + ';\n\n' + layer_str += f'static {dtype} {name}_pWeight_dram' + \ + f'[{co}][{fh}][{fw}][{ci}] = {array_to_cstr(kernel)};\n\n' + layer_str += f'static {dtype} {name}_lambda_dram' + \ + f'[{ci}] = {array_to_cstr(bn_l)};\n\n' + layer_str += f'static {dtype} {name}_kappa_dram' + \ + f'[{ci}] = {array_to_cstr(bn_k)};\n\n' + layer_str += f'static {dtype} {name}_pOutBuffer_dram' + \ + f'[{oh}][{ow}][{co}] = {array_to_cstr(ofmap_before)};\n\n' + layer_str += f'static {dtype} {name}_pCheckOutBuffer_dram' + \ + f'[{oh}][{ow}][{co}] = {array_to_cstr(ofmap)};\n\n' + + return layer_str + + +def rand_data_generator(shape, prec, alt=False): + if prec == 64: + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == 32: + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == 16: + if alt: + return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + else: + return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + elif prec == 8: + sign = torch.randint(0, 2, shape, + requires_grad=False, dtype=torch.uint8) # -1 or 1 + exponent = torch.randint(0, 16, shape, + requires_grad=False, dtype=torch.uint8) # < 0b01111 + mantissa = torch.randint(0, 4, shape, + requires_grad=False, dtype=torch.uint8) # can be arbitrary + bits = {'sign': sign, 'exponent': exponent, 'mantissa': mantissa} + # TODO: not actually correct + sign_val = (-1.0)**sign.double() + exp_val = (2.0**(exponent.double()-15.0)) + man_val = (1.0 + mantissa.double() / (2**2)) + val = sign_val*exp_val*man_val + return val, bits + + +def conv2d(ifmap, weights, padding=1, stride=1): + n, ci, ih, iw = ifmap.shape + co, _, fh, fw = weights.shape + + conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh-1)//2, (fw-1)//2)) + conv2d.weight = nn.Parameter(weights, requires_grad=False) + conv2d.bias = nn.Parameter( + 
torch.zeros_like(conv2d.bias, dtype=weights.dtype), + requires_grad=False) + ofmap = conv2d(ifmap) + + return ofmap + + +def max_pooling(ifmap, kernel): + n, ci, ih, iw = ifmap.shape + max_pool = nn.MaxPool2d(kernel_size=kernel) + ofmap = max_pool(ifmap) + + return ofmap + + +def batchnorm(ifmap): + n, ci, ih, iw = ifmap.shape + bn = torch.nn.BatchNorm2d(ci) + bn.weight.requires_grad = False + bn.bias.requires_grad = False + running_mean = torch.randn_like(bn.running_mean, requires_grad=False) + running_var = torch.rand_like(bn.running_var, requires_grad=False) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) + beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) + ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) + + return ofmap, gamma, beta + + +def fused_conv(ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise): + + ih, iw, ci = ifmap.shape + if not depthwise: + co, fh, fw, _ = weights.shape + else: + fh, fw, co = weights.shape + ci = co + + ifmap_padded = torch.zeros(ih + padding['padding_y_top'] + padding['padding_y_bottom'], iw + + padding['padding_x_left'] + padding['padding_x_right'], + ci, + requires_grad=False, dtype=ifmap.dtype) + ifmap_padded[padding['padding_y_top']:ih+padding['padding_y_top'], + padding['padding_x_left']:iw+padding['padding_x_left']] = ifmap + + # Don't cover undefined behaviour when there are steps without a complete kernel window + if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride['stride_y'] != 0: + print("Warning: rounding h output dimension") + if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride['stride_x'] != 0: + print("Warning: rounding w output dimension") + + ofmap = torch.zeros((ifmap_padded.shape[0] - (fh - 1) - 1) // stride['stride_y'] + 1, + (ifmap_padded.shape[1] - (fw - 1) - 1) // stride['stride_x'] + 1, co) + if accumulate: + ofmap_before = torch.randn_like(ofmap, requires_grad=False) + else: + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + + if verbose: + print(ifmap.shape, ifmap_padded.shape, ofmap.shape) + + if (depthwise): + # depthwise Conv2d + for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride['stride_y']): + for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride['stride_x']): + for c in range(co): + ofmap[h//stride['stride_y'], w//stride['stride_x'], + c] = torch.dot( + ifmap_padded[h:h+fh, w:w+fw, c].flatten(), + weights[:, :, c].flatten()) + else: + # Conv2d + for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride['stride_y']): + for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride['stride_x']): + for c in range(co): + ofmap[h//stride['stride_y'], w//stride['stride_x'], + c] = torch.dot( + ifmap_padded[h:h+fh, w:w+fw].flatten(), + weights[c].flatten()) + + ofmap += ofmap_before + + # BatchNorm + if bn: + ofmap = ofmap * bn_k + bn_l + + # ReLU + if relu: + ofmap = torch.nn.functional.relu(ofmap) + + return ofmap, ofmap_before, ifmap_padded + + +def linear(ifmap, weights, bias): + + ifmap = ifmap.flatten(1) + ofmap = torch.matmul(ifmap, weights.T) + bias + + return ofmap + + +def gelu(ifmap): + gelu = torch.nn.GELU() + ofmap = gelu(ifmap) + + return ofmap + + +def softmax(ifmap, axis): + softmax = torch.nn.Softmax(dim=axis) + ofmap = softmax(ifmap) + + # print the global max of the input + # print("max of input: ", torch.max(ifmap)) + + return ofmap + + +def layernorm(ifmap, eps, shape): + ln = torch.nn.LayerNorm(shape, eps=eps) + ofmap = ln(ifmap) + + return ofmap + + +def main(): + + parser = 
argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + + args = parser.parse_args() + + global verbose + verbose = args.verbose + + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + if param['prec'] == 64: + dtype = torch.float64 + elif param['prec'] == 16: + dtype = torch.float16 + elif param['prec'] == 8: + dtype = None + else: + dtype = torch.float32 + + if param['kernel'] == 'Conv2d': + ifmap = torch.randn(1, param['channels']['in'], + param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + weights = torch.randn(param['channels']['out'], + param['channels']['in'], + param['filter']['height'], + param['filter']['width'], requires_grad=False, dtype=dtype) + + ofmap = conv2d(ifmap, weights, + padding=param['filter']['padding'], + stride=param['filter']['stride']) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + weights = weights.permute(0, 2, 3, 1) + kwargs = {'ifmap': ifmap, 'weights': weights, 'ofmap': ofmap} + emit_header_file('Conv2d', **kwargs) + + elif param['kernel'] == 'GEMM': + mat_A, bits_A = rand_data_generator((param['M'], param['K']), param['prec']) + mat_B, bits_B = rand_data_generator((param['K'], param['N']), param['prec']) + mat_C, bits_C = rand_data_generator((param['M'], param['N']), param['prec']) + + result = param['alpha'] * mat_C + torch.matmul(mat_A, mat_B) + + if param['transpose_A']: + mat_A = mat_A.T + if param['transpose_B']: + mat_B = mat_B.T + + kwargs = { + 'A': mat_A, + 'B': mat_B, + 'C': mat_C, + 'result': result, + 'M': param['M'], + 'N': param['N'], + 'K': param['K'], + 'ta': param['transpose_A'], + 'tb': param['transpose_B'], + 'alpha': param['alpha'], + 'prec': param['prec'], + 'expand': param['expand'], + 'bits_A': bits_A, + 'bits_B': bits_B, + 'bits_C': bits_C + } + + emit_header_file('GEMM', **kwargs) + + elif param['kernel'] == 'BatchNorm': + ifmap = torch.randn(1, param['channels']['in'], + param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + + ofmap, gamma, beta = batchnorm(ifmap) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + + kwargs = {'ifmap': ifmap, 'beta': beta, 'gamma': gamma, 'ofmap': ofmap} + emit_header_file('BatchNorm', **kwargs) + + elif param['kernel'] == 'MaxPool': + ifmap = torch.randn(1, param['channels']['in'], + param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + + ofmap = max_pooling(ifmap, param['kernel_size']) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + + kwargs = {'ifmap': ifmap, 'ofmap': ofmap, 'kernel_size': param['kernel_size']} + emit_header_file('MaxPool', **kwargs) + + elif param['kernel'] == 'FusedConv': + ifmap = torch.randn(param['dim_in_y'], + param['dim_in_x'], + param['ch_in'], requires_grad=False, dtype=dtype) + if not param['depthwise']: + kernel = torch.randn(param['ch_out'], param['dim_kernel_y'], param['dim_kernel_x'], + param['ch_in'], requires_grad=False, dtype=dtype) + else: + kernel = torch.randn(param['dim_kernel_y'], param['dim_kernel_x'], + param['ch_in'], requires_grad=False, dtype=dtype) + + bn_k = torch.randn(param['ch_out'], 
requires_grad=False) + bn_l = torch.randn(param['ch_out'], requires_grad=False) + + flag_y_accumulate_start = param['flags']['flag_y_accumulate_start'] + ofmap, ofmap_before, ifmap_padded = fused_conv(ifmap, + kernel, + bn_k, + bn_l, + param['padding'], + param['stride'], + param['flags']['flag_batch_norm'], + param['flags']['flag_relu'], + not flag_y_accumulate_start, + param['depthwise']) + + if param['chw_layer']: + ifmap = ifmap.permute(2, 0, 1) + ifmap_padded = ifmap_padded.permute(2, 0, 1) + kernel = kernel.permute(0, 3, 1, 2) + + kwargs = { + 'ifmap': ifmap, + 'ifmap_padded': ifmap_padded, + 'ofmap': ofmap, + 'ofmap_before': ofmap_before, + 'kernel': kernel, + 'bn_k': bn_k, + 'bn_l': bn_l, + 'padding': param['padding'], + 'stride': param['stride'], + 'prec': param['prec'], + 'flags': param['flags'], + 'depthwise': param['depthwise'], + 'chw_layer': param['chw_layer'] + } + emit_header_file('FusedConv', **kwargs) + + elif param['kernel'] == 'Linear': + # in_features = param['input_dim']['width'] + # out_features = param['channels']['out'] + ifmap = torch.randn(param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + weights = torch.randn(param['channels']['out'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + bias = torch.randn(param['channels']['out'], requires_grad=False, dtype=dtype) + ofmap = linear(ifmap, weights, bias) + + kwargs = { + 'ifmap': ifmap, + 'weights': weights, + 'bias': bias, + 'ofmap': ofmap, + 'prec': param['prec'], + } + emit_header_file('Linear', **kwargs) + + elif param['kernel'] == 'GELU': + ifmap = torch.randn(param['input_dim']['batch_size'], param['input_dim']['seq_len'], + param['input_dim']['hidden_nodes'], requires_grad=False, dtype=dtype) + ofmap = gelu(ifmap) + + kwargs = { + 'ifmap': ifmap, + 'ofmap': ofmap, + 'prec': param['prec'], + } + + emit_header_file('GELU', **kwargs) + + elif param['kernel'] == 'SoftMax': + ifmap = torch.randn(param['input_dim']['batch_size'], param['input_dim']['seq_len'], + param['input_dim']['input_samples'], requires_grad=False, dtype=dtype) + ofmap = softmax(ifmap, param['reduce_dim']) + + kwargs = { + 'ifmap': ifmap, + 'ofmap': ofmap, + 'reduce_dim': param['reduce_dim'], + 'prec': param['prec'], + } + + emit_header_file('SoftMax', **kwargs) + + elif param['kernel'] == 'LayerNorm': + ifmap = torch.randn(param['input_dim']['batch_size'], param['input_dim']['seq_len'], + param['input_dim']['embeddings'], requires_grad=False, dtype=dtype) + + eps = param['eps'] + + ofmap = layernorm(ifmap, eps, param['input_dim']['embeddings']) + + ofmap = ofmap.detach().numpy() + + # print("LayerNorm output shape: ", ofmap.shape) + # print("LayerNorm output: ", ofmap) + + kwargs = { + 'ifmap': ifmap, + 'ofmap': ofmap, + 'prec': param['prec'], + } + + emit_header_file('LayerNorm', **kwargs) + + else: + print("No valid kernel selected") + + +if __name__ == '__main__': + main() diff --git a/target/sim/sw/device/apps/sndnn/gemm/Makefile b/target/sim/sw/device/apps/sndnn/gemm/Makefile new file mode 100644 index 00000000..ddd45cba --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/gemm/Makefile @@ -0,0 +1,18 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Gianna Paulin + +# DNN_DIR = $(abspath ../../../../../../../sw/snDNN) +DNN_DIR = $(abspath ..) +# SNDNN_DIR = $(abspath ..) +APPS_DIR = $(abspath ../..) 
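+
+# This per-app Makefile mainly sets APP_NAME and USE_SNDNN_LIBRARY: the shared
+# sndnn Makefile picks up ../$(APP_NAME)/src and the matching data config,
+# while common.mk (with USE_SNDNN_LIBRARY set) adds the snDNN include paths
+# and links the prebuilt libsnDNN.a from libraries/snDNN.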
+ +APP_NAME = gemm +USE_SNDNN_LIBRARY = true + +include $(DNN_DIR)/Makefile +include $(APPS_DIR)/common.mk + +$(DEP): $(DATA_DIR)/data_$(APP_NAME).h diff --git a/target/sim/sw/device/apps/sndnn/gemm/src/net_gemm.c b/target/sim/sw/device/apps/sndnn/gemm/src/net_gemm.c new file mode 100644 index 00000000..8bc0876f --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/gemm/src/net_gemm.c @@ -0,0 +1,250 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// SW testbench for profiling GEMM kernels in different +// floating point precisions (fp64, fp32, fp16), as well as +// different memory layouts for matrices (transposed/not-transposed) +// Correctness of results are checked automatically + +#include "data_gemm.h" +#include "sndnn.h" +// #include "gemm.h" +#include "layer.h" +#include "math.h" +// #include "perf_cnt.h" +// #include "printf.h" +#include "snrt.h" +// #include "utils.h" + +// Other variables +__thread volatile comm_buffer_t* comm_buffer; + +// Padding of innermost dimension of a Matrix +// Useful for preventing banking conflicts between cores +// that are accessing different rows of the matrix +#define MAT_ROW_PADDING 0 + +// Padding in between matrices A, B for preventing +// banking conflicts in the beginning +#define MAT_PADDING 0 + +#define CHECK_RESULT + +void *share_ptr; + +int main() { + +/** OFFLOAD SECTION START **/ + + // Initialize pointers + comm_buffer = (volatile comm_buffer_t*)get_communication_buffer(); + + // Notify CVA6 when snRuntime initialization is done + post_wakeup_cl(); + return_to_cva6(SYNC_ALL); + snrt_wfi(); + + // Reset state after wakeup + // mcycle(); + post_wakeup_cl(); + +/** OFFLOAD SECTION END **/ + + gemm_l.A = (void *)gemm_A_dram; + gemm_l.B = (void *)gemm_B_dram; + gemm_l.C = (void *)gemm_C_dram; + + const gemm_layer l1_gemm_l = gemm_l; + + const uint32_t cluster_num = snrt_cluster_num(); + const uint32_t cluster_id = snrt_cluster_idx(); + const uint32_t compute_num = snrt_cluster_compute_core_num(); + const uint32_t compute_id = snrt_global_core_idx(); + + void *mat_A, *mat_B, *mat_C; + + uint32_t mat_A_size = + (l1_gemm_l.M * (l1_gemm_l.K + MAT_ROW_PADDING) + MAT_PADDING) * + l1_gemm_l.dtype; + uint32_t mat_B_size = + (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.N * l1_gemm_l.dtype; + uint32_t mat_C_size = l1_gemm_l.M * l1_gemm_l.N * l1_gemm_l.dtype; + + uint32_t total_size = mat_A_size + mat_B_size + mat_C_size; + + void *ptr; + + if (compute_id == 0) { + ptr = snrt_l1alloc(total_size); + share_ptr = ptr; + } + + snrt_cluster_hw_barrier(); + + ptr = share_ptr; + + mat_A = ptr; + ptr += (l1_gemm_l.M * (l1_gemm_l.K + MAT_ROW_PADDING) + MAT_PADDING) * + l1_gemm_l.dtype; + mat_B = ptr; + ptr += (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.N * l1_gemm_l.dtype; + mat_C = ptr; + ptr += l1_gemm_l.M * l1_gemm_l.N * l1_gemm_l.dtype; + + uint32_t errors = 0; + + snrt_global_barrier(); + + if (snrt_is_dm_core()) { + snrt_dma_txid_t txid_A = + snrt_dma_start_2d(mat_A, l1_gemm_l.A, l1_gemm_l.dtype * l1_gemm_l.K, + l1_gemm_l.dtype * (l1_gemm_l.K + MAT_ROW_PADDING), + l1_gemm_l.dtype * l1_gemm_l.K, l1_gemm_l.M); + snrt_dma_txid_t txid_B = + snrt_dma_start_2d(mat_B, l1_gemm_l.B, l1_gemm_l.dtype * l1_gemm_l.K, + l1_gemm_l.dtype * (l1_gemm_l.K + MAT_ROW_PADDING), + l1_gemm_l.dtype * l1_gemm_l.K, l1_gemm_l.N); + + snrt_dma_txid_t txid_C = snrt_dma_start_1d( + mat_C, l1_gemm_l.C, l1_gemm_l.dtype * l1_gemm_l.M * l1_gemm_l.N); + + 
snrt_dma_wait_all(); + } + + // snrt_cluster_hw_barrier(); + snrt_global_barrier(); + + // if (snrt_is_compute_core() && + // snrt_cluster_compute_core_num() < compute_num) { + if (snrt_is_compute_core()) { + const uint32_t setup_SSR = 1; + + if (!l1_gemm_l.TA && !l1_gemm_l.TB) { + volatile uint32_t A_offset = + compute_id * (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.dtype; + volatile uint32_t C_offset = + compute_id * l1_gemm_l.N * l1_gemm_l.dtype; + volatile uint32_t ldA = + compute_num * (l1_gemm_l.K + MAT_ROW_PADDING); + volatile uint32_t ldB = l1_gemm_l.N + MAT_ROW_PADDING; + volatile uint32_t ldC = l1_gemm_l.N * compute_num; + + benchmark_get_cycle(); + gemm_fp64_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, l1_gemm_l.K, + &mat_A[A_offset], ldA, l1_gemm_l.TA, mat_B, ldB, + l1_gemm_l.TB, &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA, + setup_SSR); + benchmark_get_cycle(); + } else if (!l1_gemm_l.TA && l1_gemm_l.TB) { + volatile uint32_t A_offset = + compute_id * (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.dtype; + volatile uint32_t C_offset = + compute_id * l1_gemm_l.N * l1_gemm_l.dtype; + volatile uint32_t ldA = + compute_num * (l1_gemm_l.K + MAT_ROW_PADDING); + volatile uint32_t ldB = l1_gemm_l.K + MAT_ROW_PADDING; + volatile uint32_t ldC = l1_gemm_l.N * compute_num; + + benchmark_get_cycle(); + switch (l1_gemm_l.dtype) { + case FP64: + gemm_fp64_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, + l1_gemm_l.TA, mat_B, ldB, l1_gemm_l.TB, + &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA, + setup_SSR); + break; + case FP32: + gemm_fp32_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B, + ldB, &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA, + setup_SSR); + break; + case FP16: + if (l1_gemm_l.expand) { + gemm_fp16_ex_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, + mat_B, ldB, &mat_C[C_offset], ldC, + &l1_gemm_l.ALPHA, setup_SSR); + } else { + gemm_fp16_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B, + ldB, &mat_C[C_offset], ldC, + &l1_gemm_l.ALPHA, setup_SSR); + } + break; + case FP8: + gemm_fp8_ex_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B, + ldB, &mat_C[C_offset], ldC, + &l1_gemm_l.ALPHA, setup_SSR); + break; + } + benchmark_get_cycle(); + } else if (l1_gemm_l.TA) { + // printf("transpose TA not supported\n"); + } + snrt_cluster_hw_barrier(); + } else { + snrt_cluster_hw_barrier(); + } + // snrt_cluster_hw_barrier(); + snrt_global_barrier(); + +#ifdef CHECK_RESULT + + if (compute_id == 0) { + if (l1_gemm_l.dtype == FP64) { + for (uint32_t m = 0; m < l1_gemm_l.M; m++) { + double checksum = gemm_checksum[m]; + double sum = 0.0; + for (uint32_t n = 0; n < l1_gemm_l.N; n++) { + sum += ((double *)mat_C)[m * l1_gemm_l.N + n]; + } + if (fabs(sum - checksum) > 0.001) { + errors += l1_gemm_l.N; + } + } + } else if (l1_gemm_l.dtype == FP32) { + for (uint32_t m = 0; m < l1_gemm_l.M; m++) { + float checksum = gemm_checksum[m]; + float sum = 0.0; + for (uint32_t n = 0; n < l1_gemm_l.N; n++) { + sum += ((float *)mat_C)[m * l1_gemm_l.N + n]; + } + if (fabs(sum - checksum) > 0.001) { + errors += l1_gemm_l.N; + } + } + } else if (l1_gemm_l.dtype == FP16) { + for (uint32_t m = 0; m < l1_gemm_l.M; m++) { + __fp16 checksum = gemm_checksum[m]; + float sum = 0.0; + for (uint32_t n = 0; n < l1_gemm_l.N; n++) { + sum += ((__fp16 *)mat_C)[m * l1_gemm_l.N + n]; + } + if (fabs(sum - checksum) > 0.05) { + errors += l1_gemm_l.N; + } + } + } else 
if (l1_gemm_l.dtype == FP8) { + // printf("No golden model yet for fp8!\n"); + } + // printf("%d/%d Errors\n", errors, l1_gemm_l.M * l1_gemm_l.N); + } + +/** OFFLOAD SECTION START **/ + comm_buffer->usr_data_ptr = errors; +/** OFFLOAD SECTION END **/ + +#endif + +/** OFFLOAD SECTION START **/ + snrt_global_barrier(); + return_to_cva6(SYNC_ALL); +/** OFFLOAD SECTION END **/ + + // TODO: change back!!! + return 0; +} diff --git a/target/sim/sw/device/libraries/snDNN/Makefile b/target/sim/sw/device/libraries/snDNN/Makefile new file mode 100644 index 00000000..34b20158 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/Makefile @@ -0,0 +1,127 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# Gianna Paulin + +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +include $(MK_DIR)/../../toolchain.mk + +SNITCH_ROOT := $(realpath $(MK_DIR)/../../../../../../deps/snitch_cluster/) + +################### +# Build variables # +################### + +############# +# snRuntime # +############# +# Directories + +CUR_DIR := $(realpath $(MK_DIR)/.) +SNRT_DIR := $(realpath $(SNITCH_ROOT)/sw/snRuntime) +SNRT_DIR := $(shell bender path snitch_cluster)/sw/snRuntime +SNDNN_DIR := $(shell bender path snitch_cluster)/sw/snDNN + +# SNRT_DIR := $(realpath $(MK_DIR)/../../../../../sw/snRuntime) +# ifeq (SELECT_RUNTIME, banshee) +RUNTIME_DIR := $(realpath $(MK_DIR)/../../runtime) +# else +# RUNTIME_DIR := $(realpath $(MK_DIR)/../../runtime/rtl) +# endif + +# Dependencies +INCDIRS += $(RUNTIME_DIR)/src +# INCDIRS += $(RUNTIME_DIR)/../../shared +INCDIRS += $(RUNTIME_DIR)/../../shared/platform +INCDIRS += $(RUNTIME_DIR)/../../shared/platform/generated +INCDIRS += $(RUNTIME_DIR)/../../shared/runtime +INCDIRS += $(SNRT_DIR)/api +INCDIRS += $(SNRT_DIR)/api/omp +INCDIRS += $(SNRT_DIR)/src +INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes + +# Directories +BUILDDIR = $(abspath $(CUR_DIR)/build/) +SRC_DIR = $(abspath $(CUR_DIR)/src/) +SW_DIR = $(abspath $(CUR_DIR)/../../) +# SNRT_DIR = $(shell bender path snitch_cluster)/sw/snRuntime +# SNDNN_DIR = $(shell bender path snitch_cluster)/sw/snDNN + +# Dependencies +# INCDIRS += $(SNRT_DIR)/src +# INCDIRS += $(SNRT_DIR)/src/omp +# INCDIRS += $(SNRT_DIR)/api +# INCDIRS += $(SNRT_DIR)/api/omp +# INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes +# INCDIRS += $(SW_DIR)/shared/platform +# INCDIRS += $(SW_DIR)/shared/platform/generated +# INCDIRS += $(SW_DIR)/shared/runtime +# SRCS += $(SRC_DIR)/occamy_start.S +# SRCS += $(SRC_DIR)/snrt.c + +######### +# snDNN # +######### +# Directories +BUILDDIR = $(abspath build/) +# SNDNN_DIR = $(realpath $(SNITCH_ROOT)/sw/snDNN) +SRC_DIR = $(SNDNN_DIR)/src +SRC_DIR_LAYER = $(CUR_DIR)/src + +# Dependencies +INCDIRS += $(SNDNN_DIR)/src +INCDIRS += $(SNDNN_DIR)/include +INCDIRS += $(CUR_DIR)/src/ + +SRCS += $(CUR_DIR)/src/sndnn.c + +########### +# Outputs # +########### + +OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS))))) +DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS))))) +LIB = $(BUILDDIR)/libsnDNN.a +DUMP = $(BUILDDIR)/libsnDNN.dump +ALL_OUTPUTS = $(LIB) $(DUMP) + +######### +# Rules # +######### + +.PHONY: all +all: $(ALL_OUTPUTS) + +.PHONY: clean +clean: + rm -rf $(BUILDDIR) + +$(BUILDDIR): + mkdir -p 
$@ + +$(BUILDDIR)/%.o: $(SRC_DIR_LAYER)/%.S | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.o: $(SRC_DIR_LAYER)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.d: $(SRC_DIR_LAYER)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@ + +########### +# Library # +########### +$(LIB): $(OBJS) | $(BUILDDIR) + $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ + +$(DUMP): $(LIB) | $(BUILDDIR) + $(RISCV_OBJDUMP) -D $< > $@ + +ifneq ($(MAKECMDGOALS),clean) +-include $(DEPS) +endif diff --git a/target/sim/sw/device/libraries/snDNN/src/batchnorm_layer.c b/target/sim/sw/device/libraries/snDNN/src/batchnorm_layer.c new file mode 100644 index 00000000..64c07537 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/batchnorm_layer.c @@ -0,0 +1,138 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "batchnorm_layer.h" + +#include "batchnorm.h" +#include "layer.h" +// #include "printf.h" +#include "snrt.h" + +void batchnorm_layer(const conv_layer *l) { + const uint32_t cluster_num = snrt_cluster_num(); + const uint32_t cluster_id = snrt_cluster_idx(); + const uint32_t compute_num = snrt_cluster_compute_core_num(); + const uint32_t compute_id = snrt_cluster_core_idx(); + + // Each cluster loads one tile of a row + uint32_t ifmap_size = 2 * l->IW * l->TILE_CI; + uint32_t weights_size = l->CI; + uint32_t ofmap_size = 2 * l->IW * l->TILE_CI; + + double *ptr = (double *)snrt_l1_start_addr(); + double *ifmap = ptr; + ptr += ifmap_size; + double *gamma = ptr; + ptr += weights_size; + double *beta = ptr; + ptr += weights_size; + double *ofmap = ptr; + ptr += ofmap_size; + + uint32_t read_buf = 0; + uint32_t write_buf = 0; + + uint32_t prev_oh; + uint32_t prev_ow; + uint32_t prev_ci; + + for (uint32_t oh = cluster_id; oh < l->OH; oh += cluster_num) { + for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { + if (snrt_is_dm_core()) { + // Load weights once in the beginning + if (oh == cluster_id && ci == 0) { + snrt_dma_start_1d(gamma, l->gamma, sizeof(double) * l->CI); + snrt_dma_start_1d(beta, l->beta, sizeof(double) * l->CI); + snrt_dma_wait_all(); + } + + // Load some stuff + if (l->TILE_CI == l->CI) { + // data layout is consecutively in memory + snrt_dma_start_1d(&ifmap[write_buf * ifmap_size / 2], + &l->ifmap[oh * l->IW * l->CI], + sizeof(double) * l->IW * l->TILE_CI); + } else { + // data is interleaved + snrt_dma_start_2d( + &ifmap[write_buf * ifmap_size / 2], /* dst */ + &l->ifmap[oh * l->IW * l->CI + ci], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->TILE_CI, /* dst_stride */ + sizeof(double) * l->CI, /* src_stride */ + l->IW); /* repetitions */ + } + + snrt_dma_wait_all(); + + snrt_cluster_hw_barrier(); + + if (!(oh == cluster_id && ci == 0)) { + if (l->TILE_CI == l->CI) { + // data is stored consecutively + snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + &ofmap[!read_buf * (ofmap_size / 2)], + sizeof(double) * l->IW * l->CI); + } else { + // data is stored in interleaved layout + snrt_dma_start_2d( + &l->ofmap[prev_oh * l->OW * l->CI + + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + l->IW); /* repetitions */ + } + } + + snrt_dma_wait_all(); + write_buf = !write_buf; + read_buf = !read_buf; + prev_ci = ci; + 
prev_oh = oh; + /* prev_ow = ow; */ + } + + if (snrt_is_compute_core()) { + // Wait for data + snrt_cluster_hw_barrier(); + // initially setup SSRs + uint32_t setup_SSR = (oh == cluster_id && ci == 0); + + // Start kernel + batchnorm_fp64(&ifmap[read_buf * ofmap_size / 2 + compute_id], + &gamma[ci + compute_id], &beta[ci + compute_id], + &ofmap[write_buf * ofmap_size / 2 + compute_id], + l->OW, l->TILE_CI, compute_num, setup_SSR); + + write_buf = !write_buf; + read_buf = !read_buf; + } + } + } + + snrt_cluster_hw_barrier(); + + // Store last tile back + if (snrt_is_dm_core()) { + if (l->TILE_CI == l->CI) { + // data is stored consecutively + snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + &ofmap[!read_buf * (ofmap_size / 2)], + sizeof(double) * l->IW * l->CI); + } else { + // data is stored in interleaved layout + snrt_dma_start_2d( + &l->ofmap[prev_oh * l->OW * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + l->IW); /* repetitions */ + } + + snrt_dma_wait_all(); + } +} diff --git a/target/sim/sw/device/libraries/snDNN/src/conv2d_layer.c b/target/sim/sw/device/libraries/snDNN/src/conv2d_layer.c new file mode 100644 index 00000000..27ad83c8 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/conv2d_layer.c @@ -0,0 +1,396 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "conv2d_layer.h" + +#include "gemm.h" +#include "layer.h" +// #include "printf.h" +#include "snrt.h" +#include "utils.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? 
(a) : (b)) + +void conv2d_layer(const conv_layer *l) { + uint32_t cluster_num = snrt_cluster_num(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t compute_num = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_cluster_compute_core_num(); + + const uint32_t cluster_per_quadrant = min(4, cluster_num); + + // typedef struct cluster_mem_alloc_struct { + // double im2col[2][compute_num][l->FW*l->FH*l->TILE_CI+1]; + // double ifmap[2][l->FH][compute_num + l->FW - 1][l->TILE_CI]; + // double weights[compute_num][l->FH*l->FW*l->TILE_CI+1]; + // double ofmap[2][compute_num][8]; + // volatile uint32_t synch_flag[2]; + // } cluster_mem_alloc; + + // im2col[2][compute_num][l->FW*l->FH*l->TILE_CI+1]; + uint32_t im2col_row_stride = l->FW * l->FH * l->TILE_CI + 1; + uint32_t im2col_mat_stride = im2col_row_stride * compute_num; + uint32_t im2col_size = 2 * im2col_mat_stride; + + // ifmap[2][l->FH][compute_num + l->FW - 1][l->TILE_CI]; + uint32_t ifmap_col_stride = l->TILE_CI; + uint32_t ifmap_row_stride = ifmap_col_stride * (compute_num + l->FW - 1); + uint32_t ifmap_stride = ifmap_row_stride * l->FH; + uint32_t ifmap_size = 2 * ifmap_stride; + + // weights[compute_num][l->FH*l->FW*l->TILE_CI+1]; + uint32_t weights_co_stride = l->FH * l->FW * l->TILE_CI + 1; + uint32_t weights_size = compute_num * weights_co_stride; + + // ofmap[2][compute_num][8]; + uint32_t ofmap_co_stride = 8; + uint32_t ofmap_stride = compute_num * ofmap_co_stride; + uint32_t ofmap_size = 2 * ofmap_stride; + + double *ptr = (double *)snrt_l1_next(); + double *im2col = ptr; + ptr += im2col_size; + double *ifmap = ptr; + ptr += ifmap_size; + double *weights = ptr; + ptr += weights_size; + double *ofmap = ptr; + ptr += ofmap_size; + volatile uint32_t *synch_flag = (void *)ptr; + + uint32_t write_buf = 0; + uint32_t read_buf = 0; + + int32_t oh_prev = -1; + int32_t ow_prev = -1; + + // snrt_global_barrier(); + + benchmark_get_cycle(); + + // Distribute output channels across clusters + for (uint32_t co = cluster_id * compute_num; co < l->CO; + co += cluster_num * compute_num) { + // Tile CI dimension + for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { + benchmark_get_cycle(); + + // Load weights in the beginning + if (snrt_is_dm_core()) { + snrt_dma_start_tracking(); + + // Weights are stored in CO x FH x FW x CI format with + // additional padding (CI + 1) to prevent banking conflicts + for (uint32_t _co = 0; _co < 8; _co++) { + if (l->TILE_CI == l->CI) { + snrt_dma_txid_t weight_txid = snrt_dma_start_1d( + &weights[_co * weights_co_stride], /* dst */ + &l->weights[(co + _co) * l->FH * l->FW * + l->CI], /* src */ + sizeof(double) * l->CI * l->FH * l->FW /* size */); + } else { + snrt_dma_txid_t weight_txid = snrt_dma_start_2d( + &weights[_co * weights_co_stride], /* dst */ + &l->weights[(co + _co) * l->FH * l->FW * l->CI + + ci], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->TILE_CI, /* dst_stride */ + sizeof(double) * l->CI, /* src_stride */ + l->FH * l->FW /* repetitions */); + } + } + snrt_dma_wait_all(); + + snrt_dma_stop_tracking(); + } + benchmark_get_cycle(); + + // Iterate over pixels, outer loop iterates over tiles of columns in + // feature map, inner loop iterates over rows. Each core processes + // one pixel at a time. In case of cluster2cluster communication, + // each cluster in a quadrant starts with a different row. The first + // time, all clusters load a different row from memory. 
In each + // subsequent iteration the leading cluster loads a new row from + // main memory and the others load from the next cluster + for (uint32_t ow = 0; ow < l->OW; ow += compute_num) { + if (l->cluster2cluster) { + synch_flag[0] = 0; + synch_flag[1] = 0; + } + + for (uint32_t _oh = 0; _oh < l->OH; _oh++) { + // If cluster2cluster is enabled, each cluster starts with a + // different row, requires that OH is bigger than + // cluster_num (per quadrant at least) + uint32_t oh = ((cluster_per_quadrant - 1) - + (cluster_id % cluster_per_quadrant) + _oh) % + l->OH; + + if (snrt_is_dm_core()) { + uint32_t n_ifmap_pixel_read = + min(compute_num + l->FW - 1, + l->IW - ow + (l->pad << 1)); + uint32_t n_ofmap_pixel_read = + min(compute_num, l->OW - ow); + uint32_t n_ofmap_pixel_write = + min(compute_num, l->OW - ow_prev); + + // Load the intermediate outputs from memory + if (ci != 0) { + snrt_dma_txid_t ofmap_txid = snrt_dma_start_2d( + &ofmap[write_buf * ofmap_stride], /* dst */ + &l->ofmap[(oh * l->OW + ow) * l->CO + + co], /* src */ + sizeof(double) * 8, /* size */ + sizeof(double) * 8, /* dst_stride */ + sizeof(double) * l->CO, /* src_stride */ + n_ofmap_pixel_read); /* repetitions */ + snrt_dma_wait_all(); + } else { + dma_memset(&ofmap[write_buf * ofmap_stride], 0, + sizeof(double) * 8 * n_ofmap_pixel_read); + } + + if (l->cluster2cluster) { + // All except last cluster need to wait until + // cluster synch flag is cleared + if (cluster_id % cluster_per_quadrant != + cluster_per_quadrant - 1) { + while (synch_flag[write_buf]) + ; + } + } + + snrt_dma_start_tracking(); + + // The input feature map needs to be loaded from main + // memory in the following cases: 1) cluster2cluster + // communication is not enabled 2) The first iteration, + // every cluster loads a row from main memory 3) The + // leading cluster always loads rows from main memory + if (!l->cluster2cluster || _oh == 0 || + cluster_id % cluster_per_quadrant == 0) { + // Transfer in FH * (compute_num + FW - 1) pixels + // such that im2col transformation can be performed + // for every core + + for (uint32_t fh = 0; fh < l->FH; fh++) { + // Fill horizontal lines with zeros for padding + if (oh + fh < l->pad || + oh + fh >= l->IH + ((l->FH - 1) >> 1)) { + dma_memset(&ifmap[write_buf * ifmap_stride + + fh * ifmap_row_stride], + 0, + sizeof(double) * l->TILE_CI * + n_ifmap_pixel_read); + } else { + uint32_t padding_left = + (ow < l->pad) ? (l->pad - ow) : 0; + uint32_t padding_right = + (ow + compute_num + l->pad <= l->OW) + ? 
0 + : n_ifmap_pixel_read - + ((l->FW - 1) >> 1) - + (l->IW - ow); + + // If there is need for padding, set whole + // buffer to zero + if (padding_left || padding_right) { + dma_memset( + &ifmap[write_buf * ifmap_stride + + fh * ifmap_row_stride], + 0, + sizeof(double) * + (compute_num + l->FW - 1) * + l->TILE_CI); + } + + // Then fill in the rest of the values + snrt_dma_txid_t ifmap_txid = + snrt_dma_start_2d( + &ifmap[write_buf * ifmap_stride + + fh * ifmap_row_stride + + padding_left * + ifmap_col_stride], /* dst + */ + (double *)&l->ifmap + [((oh + fh - l->pad) * l->IW + + ow - + (l->pad - padding_left)) * + l->CI + + ci], /* src */ + sizeof(double) * + l->TILE_CI, /* size */ + sizeof(double) * + l->TILE_CI, /* dst_stride */ + sizeof(double) * + l->CI, /* src_stride */ + n_ifmap_pixel_read - padding_left - + padding_right /* n_ifmap_pixel_read + */ + /* repetitions */); + snrt_dma_wait_all(); + } + } + + } + + // Transfer tile from other cluster to memory + else { + // A cluster always loads from the previous cluster + // uint32_t cluster_offset = 0x00040000; # TODO: already defined + volatile uint32_t *src_synch_flag = + (void *)synch_flag - cluster_offset; + double *src_ifmap = (void *)ifmap - cluster_offset; + + // Wait until previous cluster has released data + if (l->cluster2cluster && + (cluster_id % cluster_per_quadrant) != 0) { + while (src_synch_flag[!write_buf] == 0) + ; + } + + // Transfer in FH * (compute_num + FW - 1) pixels + // such that im2col transformation can be performed + // for every core + snrt_dma_txid_t ifmap_txid = snrt_dma_start_1d( + &ifmap[write_buf * ifmap_stride], + &src_ifmap[!write_buf * ifmap_stride], + sizeof(double) * n_ifmap_pixel_read * + l->TILE_CI * l->FH); + snrt_dma_wait_all(); + + // clear synch flag of src cluster + if (l->cluster2cluster && + (cluster_id % cluster_per_quadrant) != 0) { + // printf("Cluster %d clearing synch flag %p\n", + // cluster_id, &src_synch_flag[!write_buf]); + src_synch_flag[!write_buf] = 0; + } + } + + snrt_dma_stop_tracking(); + + // New data is produced + if (l->cluster2cluster) { + synch_flag[write_buf] = 1; + // printf("Cluster %d setting synch flag %p\n", + // cluster_id, &synch_flag[write_buf]); + } + + snrt_dma_start_tracking(); + + // Reshuffle and write data to the im2col buffer by the + // DMA + for (uint32_t n = 0; n < compute_num; n++) { + // only construct im2col matrix for leftover pixels + if (ow + n < l->OW) { + snrt_dma_txid_t im2col_txid = snrt_dma_start_2d( + &im2col[write_buf * im2col_mat_stride + + n * im2col_row_stride], /* dst */ + &ifmap[read_buf * ifmap_stride + + n * ifmap_col_stride], /* src */ + sizeof(double) * l->FW * + l->TILE_CI, /* size */ + sizeof(double) * l->FW * + l->TILE_CI, /* dst_stride */ + sizeof(double) * (compute_num + l->FW - 1) * + l->TILE_CI, /* src_stride */ + l->FH /* repetitions */); + } + } + + // Wait for im2col transform to end, and synchronize + // with compute cores + snrt_dma_wait_all(); + snrt_dma_stop_tracking(); + snrt_cluster_hw_barrier(); + benchmark_get_cycle(); + + // Transfer back the output feature maps + if (oh_prev + ow_prev >= 0) { + snrt_dma_txid_t ofmap_txid = snrt_dma_start_2d( + &l->ofmap[(oh_prev * l->OW + ow_prev) * l->CO + + co], /* dst */ + &ofmap[!read_buf * ofmap_stride], /* src */ + sizeof(double) * 8, /* size */ + sizeof(double) * l->CO, /* dst_stride */ + sizeof(double) * 8, /* src_stride */ + n_ofmap_pixel_write); /* repetitions */ + snrt_dma_wait_all(); + } + oh_prev = oh; + ow_prev = ow; + + // Toggle write and read buffer + 
write_buf = !write_buf; + read_buf = !read_buf; + } + + if (snrt_is_compute_core()) { + // Wait until DMA core has finished the im2col transform + benchmark_get_cycle(); + snrt_cluster_hw_barrier(); + benchmark_get_cycle(); + + // Each core performs a matrix multiplication on the + // im2col buffer Of size (1 x FHxFWxCI) x (FHxFWxCI x + // 8), 8 represents CO and is the unrolling factor + // needed to prevent RAW conflicts. + if (ow + compute_id < l->OW) { + uint32_t setup_SSR = + (ci == 0 && ow == 0 && _oh == 0) ? 1 : 0; + + if (ci != 0 && l->TILE_CI != l->CI) { + const uint32_t alpha = 0; + gemm_fp64_opt( + 1, 8, l->FH * l->FW * l->TILE_CI, + &im2col[read_buf * im2col_mat_stride + + compute_id * im2col_row_stride], + 0, 0, weights, + l->FH * l->FW * l->TILE_CI + 1, 1, + &ofmap[write_buf * ofmap_stride + + compute_id * ofmap_co_stride], + 0, &alpha, setup_SSR); + + } else { + const uint32_t alpha = 1; + gemm_fp64_opt( + 1, 8, l->FH * l->FW * l->TILE_CI, + &im2col[read_buf * im2col_mat_stride + + compute_id * im2col_row_stride], + 0, 0, weights, + l->FH * l->FW * l->TILE_CI + 1, 1, + &ofmap[write_buf * ofmap_stride + + compute_id * ofmap_co_stride], + 0, &alpha, setup_SSR); + } + } + // Toggle read and write buffer + read_buf = !read_buf; + write_buf = !write_buf; + } + } + } + + snrt_cluster_hw_barrier(); + + // Transfer back last output tile + if (snrt_is_dm_core()) { + snrt_dma_txid_t ofmap_txid = snrt_dma_start_2d( + &l->ofmap[(oh_prev * l->OW + ow_prev) * l->CO + + co], /* dst */ + &ofmap[!read_buf * ofmap_stride], /* src */ + sizeof(double) * 8, /* size */ + sizeof(double) * l->CO, /* dst_stride */ + sizeof(double) * 8, /* src_stride */ + min(compute_num, l->OW - ow_prev)); /* repetitions */ + snrt_dma_wait_all(); + } + } + } + + // snrt_global_barrier(); +} diff --git a/target/sim/sw/device/libraries/snDNN/src/gelu_layer.c b/target/sim/sw/device/libraries/snDNN/src/gelu_layer.c new file mode 100644 index 00000000..f45c2849 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/gelu_layer.c @@ -0,0 +1,69 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+#include "gelu_layer.h"
+
+#include "gelu.h"
+#include "layer.h"
+// #include "printf.h"
+#include "snrt.h"
+
+void gelu_layer(const gelu_layer_t *l) {
+    uint32_t cluster_num = snrt_cluster_num();
+    uint32_t cluster_id = snrt_cluster_idx();
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    uint32_t compute_id = snrt_global_core_idx();
+
+    uint32_t ifmap_size =
+        l->BATCH_SIZE * l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float);
+    uint32_t ofmap_size = ifmap_size;
+
+    void *ptr = (float *)snrt_l1_next();
+    float *ifmap = ptr;
+    ptr += ifmap_size;
+    float *ofmap = ptr;
+    ptr += ofmap_size;
+
+    // DMA transfer the ifmap into the cluster TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
+            ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
+            l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
+            l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_compute_core()) {
+        // determine the row offset for each core
+        int32_t row_offset = compute_id * l->HIDDEN_NODES;
+
+        // determine the row stride of each matrix
+        int32_t ldI = compute_num * l->HIDDEN_NODES;
+
+        // determine the batch offset for each core
+        int32_t batch_offset = l->SEQ_LEN * l->HIDDEN_NODES;
+
+        // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
+
+        for (int b = 0; b < l->BATCH_SIZE; b++) {
+            // if (compute_id == 1) {
+            //     printf("BATCH: %d\n", b);
+            // }
+            gelu_fp32(&ifmap[row_offset + b * batch_offset],
+                      &ofmap[row_offset + b * batch_offset], ldI, l->BATCH_SIZE,
+                      l->SEQ_LEN / 8, l->HIDDEN_NODES);
+        }
+
+        snrt_cluster_hw_barrier();
+
+    } else {
+        snrt_cluster_hw_barrier();
+    }
+
+    snrt_global_barrier();
+}
\ No newline at end of file
diff --git a/target/sim/sw/device/libraries/snDNN/src/gemm_layer.c b/target/sim/sw/device/libraries/snDNN/src/gemm_layer.c
new file mode 100644
index 00000000..e69de29b
diff --git a/target/sim/sw/device/libraries/snDNN/src/layernorm_layer.c b/target/sim/sw/device/libraries/snDNN/src/layernorm_layer.c
new file mode 100644
index 00000000..3a1c63df
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/layernorm_layer.c
@@ -0,0 +1,60 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +#include "layernorm_layer.h" + +#include "layer.h" +#include "layernorm.h" +// #include "printf.h" +#include "snrt.h" + +void layernorm_layer(const layernorm_layer_t *l) { + uint32_t cluster_num = snrt_cluster_num(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t compute_num = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_global_core_idx(); + + uint32_t ifmap_size = + l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float); + uint32_t ofmap_size = ifmap_size; + + void *ptr = (float *)snrt_l1_next(); + float *ifmap = ptr; + ptr += ifmap_size; + float *ofmap = ptr; + ptr += ofmap_size; + + // DMA transfer the ifmap into the cluster TCDM + if (snrt_is_dm_core()) { + snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( + ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), + l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), + l->SEQ_LEN * l->EMBEDDINGS * sizeof(float)); + + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_is_compute_core()) { + // determine the row offset for each core + int32_t row_offset = compute_id * l->EMBEDDINGS; + + // determine the row stride of each matrix + int32_t ldI = compute_num * l->EMBEDDINGS; + + // determine the batch offset for each core + int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS; + + // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); + layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, + batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8, + l->EMBEDDINGS, l->EPS); + + } else { + snrt_cluster_hw_barrier(); + } + + snrt_global_barrier(); +} \ No newline at end of file diff --git a/target/sim/sw/device/libraries/snDNN/src/linear_layer.c b/target/sim/sw/device/libraries/snDNN/src/linear_layer.c new file mode 100644 index 00000000..f19a74bd --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/linear_layer.c @@ -0,0 +1,104 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+#include "linear_layer.h"
+
+#include "layer.h"
+#include "linear.h"
+// #include "printf.h"
+#include "snrt.h"
+
+void linear_layer(const linear_layer_t *l) {
+    uint32_t cluster_num = snrt_cluster_num();
+    uint32_t cluster_id = snrt_cluster_idx();
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    uint32_t compute_id = snrt_global_core_idx();
+
+    uint32_t ifmap_size = l->CH * l->CW * sizeof(float);
+    uint32_t weights_size = l->CO * l->CI * sizeof(float);
+    uint32_t bias_size = l->CO * sizeof(float);
+    uint32_t ofmap_size = l->CH * l->CO * sizeof(float);
+
+    void *ptr = (float *)snrt_l1_next();
+    float *ifmap = ptr;
+    ptr += ifmap_size;
+    float *weights = ptr;
+    ptr += weights_size;
+    float *bias = ptr;
+    ptr += bias_size;
+    float *ofmap = ptr;
+    ptr += ofmap_size;
+    float *result = ptr;
+    ptr += ofmap_size;
+
+    // now we DMA transfer the weights and bias into the cluster TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_bias = snrt_dma_start_1d(bias, l->bias, bias_size);
+        snrt_dma_txid_t txid_weights = snrt_dma_start_2d(
+            weights, l->weights, l->CO * sizeof(float), l->CO * sizeof(float),
+            l->CO * sizeof(float), l->CI * sizeof(float));
+
+        snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
+            ifmap, l->ifmap, l->CH * sizeof(float), l->CH * sizeof(float),
+            l->CH * sizeof(float), l->CW * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_compute_core()) {
+        // determine the row stride of each matrix
+        int32_t ldI = l->CH * l->CW;
+        int32_t ldW = compute_num * l->CO;
+        int32_t ldB = compute_num;
+        int32_t ldO = ldB;
+
+        // determine the row offset of each matrix
+        int32_t offW = compute_id * l->CO;
+        int32_t offB = compute_id;
+        int32_t offO = compute_id;
+
+        // printf("compute_id = %d, offW = %d, offB = %d, offO = %d\n",
+        //        compute_id, offW, offB, offO);
+
+        linear_fp32(ifmap, ldI, &weights[offW], ldW, &bias[compute_id], ldB,
+                    ofmap, ldO, l->CI, l->CO / compute_num, l->CH);
+
+        snrt_cluster_hw_barrier();
+
+    } else {
+        snrt_cluster_hw_barrier();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_result = snrt_dma_start_2d(
+            result, l->result, l->CH * sizeof(float), l->CH * sizeof(float),
+            l->CH * sizeof(float), l->CO * sizeof(float));
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // TODO: fix this, wrong values for ofmap printed
+    if (compute_id == 0) {
+        // compare result with ofmap
+        float tolerance = 1e-6;
+        int error = 0;
+        for (int i = 0; i < l->CH; i++) {
+            for (int j = 0; j < l->CO; j++) {
+                // compare the absolute difference against the tolerance
+                float diff = result[i * l->CO + j] - ofmap[i * l->CO + j];
+                if (diff > tolerance || diff < -tolerance) {
+                    printf(
+                        "MISMATCH: result[%d][%d] = %f, ofmap[%d][%d] = %f\n",
+                        i, j, result[i * l->CO + j], i, j,
+                        ofmap[i * l->CO + j]);
+                    error += 1;
+                }
+            }
+        }
+
+        printf("[%d/%d] mismatches\n", error, l->CH * l->CO);
+    }
+}
\ No newline at end of file
diff --git a/target/sim/sw/device/libraries/snDNN/src/maxpool_layer.c b/target/sim/sw/device/libraries/snDNN/src/maxpool_layer.c
new file mode 100644
index 00000000..56183741
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/maxpool_layer.c
@@ -0,0 +1,115 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +#include "maxpool_layer.h" + +#include "layer.h" +#include "maxpool.h" +// #include "printf.h" +#include "snrt.h" + +void maxpool_layer(const conv_layer *l) { + uint32_t cluster_num = snrt_cluster_num(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t compute_num = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_global_core_idx(); + + // Each cluster loads one tile of kernel size + uint32_t ifmap_size = 2 * l->FH * l->FW * l->TILE_CI; + uint32_t ofmap_size = 2 * l->TILE_CI; + + double *ptr = (double *)snrt_l1_next(); + double *ifmap = ptr; + ptr += ifmap_size; + double *ofmap = ptr; + ptr += ofmap_size; + + uint32_t read_buf = 0; + uint32_t write_buf = 0; + + uint32_t prev_oh; + uint32_t prev_ow; + uint32_t prev_ci; + + // tiles are distributed across clusters + for (uint32_t tile = cluster_id; tile < l->OH * l->OW; + tile += cluster_num) { + for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { + uint32_t oh = tile / l->OW; + uint32_t ow = tile % l->OW; + + if (snrt_is_dm_core()) { + for (uint32_t fh = 0; fh < l->FH; fh++) { + if (l->TILE_CI == l->CI) { + snrt_dma_start_1d( + &ifmap[write_buf * (ifmap_size / 2) + + fh * l->FW * l->TILE_CI], /* dst */ + &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) * + l->CI], /* src */ + sizeof(double) * l->TILE_CI * l->FW /* size */); + } else { + // printf("bubu\n"); + snrt_dma_start_2d( + &ifmap[write_buf * (ifmap_size / 2) + + fh * l->FW * l->TILE_CI], /* dst */ + &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) * + l->CI + + ci], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->TILE_CI, /* dst_stride */ + sizeof(double) * l->CI, /* src_stride */ + l->FW /* repetitions */); + } + } + snrt_dma_wait_all(); + + // synchronize with compute cores after loading data + snrt_cluster_hw_barrier(); + + if (!(tile == cluster_id && ci == 0)) { + snrt_dma_start_2d( + &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + 1 /* repetitions */); + } + + snrt_dma_wait_all(); + write_buf = !write_buf; + read_buf = !read_buf; + prev_ci = ci; + prev_oh = oh; + prev_ow = ow; + } + + if (snrt_is_compute_core()) { + // wait for data to arrive + snrt_cluster_hw_barrier(); + + maxpool_fp64(&ifmap[read_buf * ifmap_size / 2 + compute_id], + &ofmap[write_buf * ofmap_size / 2 + compute_id], + l->TILE_CI, l->FH, l->FW, compute_num); + + write_buf = !write_buf; + read_buf = !read_buf; + } + } + } + + snrt_cluster_hw_barrier(); + + if (snrt_is_dm_core()) { + snrt_dma_start_2d( + &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + 1 /* repetitions */); + snrt_dma_wait_all(); + } +} diff --git a/target/sim/sw/device/libraries/snDNN/src/nnlinear_backend_baseline.c b/target/sim/sw/device/libraries/snDNN/src/nnlinear_backend_baseline.c new file mode 100644 index 00000000..86413de3 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/nnlinear_backend_baseline.c @@ -0,0 +1,275 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+#include "nnlinear_backend_baseline.h"
+
+#include "network.h"
+#include "nnlinear_baseline.h"
+// #include "printf.h"
+#include "snrt.h"
+#include "utils.h"
+
+// define which parts of the network to run
+#define RUN_FEEDFORWARD 1
+#define RUN_GRADIENT_UPDATE 1
+#define RUN_TRAINING_STEP 1
+#define GET_ACCURACY 1
+#define GET_LOSS 1
+#define RUN_RTL 0
+#define NUM_EPOCHS 1
+#define BATCH_SIZE 1
+#define DATASET_SIZE 2 // 60000
+#define INFO 0
+
+void nnlinear_backend_baseline(const network_fp32_t *n) {
+    uint32_t cluster_num = snrt_cluster_num(); // Total number of clusters
+    uint32_t cluster_core_num =
+        snrt_cluster_core_num(); // Total cores per cluster
+    uint32_t cluster_id = snrt_cluster_idx(); // Cluster ID
+    uint32_t compute_num =
+        snrt_cluster_compute_core_num(); // Number of compute cores per cluster
+    uint32_t global_compute_num =
+        snrt_global_core_num(); // Total cores incl. DM core per cluster
+    uint32_t compute_id =
+        snrt_cluster_core_idx(); // Core ID of each compute core
+    uint32_t dm_id = snrt_cluster_dm_core_idx(); // DM core ID of each cluster
+    uint32_t global_compute_id =
+        snrt_global_core_idx(); // Core ID of each core on all clusters
+
+    if (INFO == 1) {
+        if (compute_id == 0 && cluster_id == 0) {
+            printf(
+                "======================== System Info "
+                "========================\n");
+            printf("Total number of clusters: %d\n", cluster_num);
+            printf("Total cores per cluster: %d\n", cluster_core_num);
+            printf("Number of compute cores per cluster: %d\n", compute_num);
+            printf("Total cores incl. DM core per cluster: %d\n",
+                   global_compute_num);
+            printf(
+                "============================================================="
+                "\n");
+        }
+    }
+
+    snrt_cluster_hw_barrier();
+
+    uint32_t weights_size = NUM_CLASSES * IN_CH * n->dtype;
+    uint32_t biases_size = NUM_CLASSES * n->dtype;
+    uint32_t activations_size = NUM_CLASSES * n->dtype;
+    uint32_t image_size = IN_CH * n->dtype;
+    uint32_t loss_size = n->dtype;
+    uint32_t labels_size = sizeof(uint32_t);
+
+    // cluster 0 variables:
+    float *weights;
+    float *weight_grads;
+    float *biases;
+    float *bias_grads;
+    float *images;
+    float *activations;
+    float *loss;
+    uint32_t *targets;
+
+    void *tcdm_ptr = (float *)snrt_l1_next();
+
+    // cluster 0 memory map
+    weights = tcdm_ptr;
+    tcdm_ptr += weights_size;
+    weight_grads = tcdm_ptr;
+    tcdm_ptr += weights_size;
+    biases = tcdm_ptr;
+    tcdm_ptr += biases_size;
+    activations = tcdm_ptr;
+    tcdm_ptr += activations_size;
+    bias_grads = tcdm_ptr;
+    tcdm_ptr += biases_size;
+    images = tcdm_ptr;
+    tcdm_ptr += image_size;
+    loss = tcdm_ptr;
+    tcdm_ptr += loss_size;
+    targets = tcdm_ptr;
+    tcdm_ptr += labels_size;
+
+    // DRAM pointers to images and targets
+    uint32_t *images_dram = (void *)0x80040000;
+    uint32_t *targets_dram = (void *)0x80108000;
+
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_B = snrt_dma_start_1d(biases, n->b, biases_size);
+        snrt_dma_wait_all();
+        snrt_dma_txid_t txid_W =
+            snrt_dma_start_2d(weights, n->W, IN_CH * n->dtype, IN_CH * n->dtype,
+                              IN_CH * n->dtype, NUM_CLASSES);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    uint32_t number_of_images = 256;
+    int correct = 0;
+    int predict = 0;
+    int epoch_count = 0;
+    float epoch_loss = 0, epoch_acc = 0;
+    float mean_epoch_loss = 0, mean_epoch_acc = 0;
+    float batch_acc = 0;
+    float batch_loss = 0;
+    loss[0] = 0.0f;
+
+    int batches = DATASET_SIZE / BATCH_SIZE;
+
+    for (int epoch = 0; epoch < NUM_EPOCHS; epoch++) {
+        if (INFO == 1) {
+            if (compute_id == 0 && cluster_id == 0) {
+                printf(
+                    
"======================== EPOCH [%d/%d] start. " + "========================\n", + (epoch + 1), NUM_EPOCHS); + } + } + for (int batch = 0; batch < batches; batch++) { + batch_loss = 0; + batch_acc = 0; + correct = 0; + if (snrt_is_compute_core()) { + if (INFO == 1) { + if (compute_id == 0 && cluster_id == 0) { + printf( + "======================== BATCH [%d/%d] start. " + "========================\n", + (batch + 1), batches); + } + } + /* Zero out the gradients + * TODO: make this more efficient! + */ + for (int i = 0; i < NUM_CLASSES; i++) { + bias_grads[i] = 0; + for (int j = 0; j < IN_CH; j++) { + weight_grads[i * IN_CH + j] = 0; + } + } + + if (INFO == 1) { + if (compute_id == 0 && cluster_id == 0) { + printf("INFO: Gradients have been zeroed out.\n"); + } + } + + snrt_cluster_hw_barrier(); + + } else if (!snrt_is_compute_core()) { + snrt_cluster_hw_barrier(); + } + for (uint32_t image = 0; image < BATCH_SIZE; image++) { + uint32_t volatile curr_img = + image * IN_CH + batch * BATCH_SIZE * IN_CH; + // printf("======================== Image %d + // ========================\n", curr_img / 784); + uint32_t volatile curr_target = image + batch * BATCH_SIZE; + if (snrt_is_dm_core()) { + float img_checksum = 0; + snrt_dma_start_tracking(); + snrt_dma_txid_t txid_img = + snrt_dma_start_1d(images, // destination + &images_dram[curr_img], // source + n->dtype * IN_CH); // size + snrt_dma_wait_all(); + snrt_dma_txid_t txid_target = + snrt_dma_start_1d(targets, // destination + &targets_dram[curr_target], // source + sizeof(uint32_t)); // size + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_is_compute_core() && + snrt_cluster_compute_core_num() < compute_num) { + GradientUpdate_baseline(images, activations, biases, + weights, weight_grads, bias_grads, + targets[0], loss); + snrt_cluster_hw_barrier(); + batch_loss += *loss; + /* Accuracy Calculation */ + float max_activation = activations[0]; + predict = 0; + for (int i = 0; i < NUM_CLASSES; i++) { + if (max_activation < activations[i]) { + max_activation = activations[i]; + predict = i; + } + } + + if (predict == targets[0]) { + correct++; + } + snrt_cluster_hw_barrier(); + + // printf("pred = %d, target = %d\n", predict, targets[0]); + + } else if (!snrt_is_compute_core()) { + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + } + } + + snrt_cluster_hw_barrier(); + + // After one epoch we update the weights + if (snrt_is_compute_core() && + snrt_cluster_compute_core_num() < compute_num) { + batch_acc = (float)correct / (float)BATCH_SIZE; + epoch_acc += batch_acc; + epoch_loss += batch_loss / BATCH_SIZE; + if (INFO == 1) { + if (compute_id == 0 && cluster_id == 0) { + printf( + "A total of [%d/%d] images were predicted " + "correctly in " + "batch %d\n", + correct, BATCH_SIZE, batch + 1); + printf("batch acc = %.6f\n", batch_acc * 100); + printf("batch loss = %.6f\n", batch_loss / BATCH_SIZE); + } + } + + TrainingStep_baseline(biases, weights, weight_grads, bias_grads, + n->learning_rate); + + if (batch % (batches - 1) == 0 && batch != 0) { + epoch_count++; + mean_epoch_loss = epoch_loss / batches; + mean_epoch_acc = epoch_acc / batches; + if (INFO == 1) { + if (compute_id == 0 && cluster_id == 0) { + printf( + "=========================== EPOCH %u done. 
" + "===========================\n", + epoch_count); + printf( + "=========================== Epoch Acc %.3f " + "===========================\n", + mean_epoch_acc * 100); + printf( + "=========================== Epoch Loss %.3f " + " " + "===========================\n", + mean_epoch_loss); + } + } + epoch_loss = 0; + epoch_acc = 0; + } + + } else if (!snrt_is_compute_core()) { + snrt_cluster_hw_barrier(); + } + + snrt_cluster_hw_barrier(); + } + } + snrt_global_barrier(); +} \ No newline at end of file diff --git a/target/sim/sw/device/libraries/snDNN/src/sndnn.c b/target/sim/sw/device/libraries/snDNN/src/sndnn.c new file mode 100644 index 00000000..c48d057c --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/sndnn.c @@ -0,0 +1,20 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "sndnn.h" + +#include "batchnorm.c" +#include "batchnorm_layer.c" +#include "conv2d.c" +#include "conv2d_layer.c" +#include "gelu_layer.c" +#include "gemm.c" +#include "layernorm_layer.c" +#include "linear_layer.c" +#include "maxpool.c" +#include "maxpool_layer.c" +#include "snrt.h" +#include "utils.c" +// #include "nnlinear_backend_baseline.c" +// #include "softmax_layer.c" diff --git a/target/sim/sw/device/libraries/snDNN/src/sndnn.h b/target/sim/sw/device/libraries/snDNN/src/sndnn.h new file mode 100644 index 00000000..1d0775b7 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/sndnn.h @@ -0,0 +1,29 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +// // Snitch cluster specific +// #include "snitch_cluster_defs.h" + +#include "batchnorm.h" +#include "batchnorm_layer.h" +#include "conv2d.h" +#include "conv2d_layer.h" +#include "gelu.h" +#include "gelu_layer.h" +#include "gemm.h" +#include "layer.h" +#include "layernorm.h" +#include "layernorm_layer.h" +#include "linear.h" +#include "linear_layer.h" +#include "maxpool_layer.h" +#include "network.h" +#include "utils.h" +// #include "nnlinear_backend_baseline.h" +// #include "softmax_layer.h" diff --git a/target/sim/sw/device/libraries/snDNN/src/softmax_layer.c b/target/sim/sw/device/libraries/snDNN/src/softmax_layer.c new file mode 100644 index 00000000..8dcbe592 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/softmax_layer.c @@ -0,0 +1,60 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// #include "softmax_layer.h" + +#include "layer.h" +// #include "printf.h" +#include "sndnn.h" +#include "snrt.h" +#include "softmax.h" + +void softmax_layer(softmax_layer_t *const l) { + uint32_t cluster_num = snrt_cluster_num(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t compute_num = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_global_core_idx(); + + uint32_t ifmap_size = + l->BATCH_SIZE * l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float); + uint32_t ofmap_size = ifmap_size; + + void *ptr = (float *)snrt_l1_next(); + float *ifmap = ptr; + ptr += ifmap_size; + float *ofmap = ptr; + ptr += ofmap_size; + + // DMA transfer the ifmap into the cluster TCDM + if (snrt_is_dm_core()) { + snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( + ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), + l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), + l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float)); + + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_is_compute_core()) { + // determine the row offset for each core + int32_t row_offset = compute_id * l->INPUT_SAMPLES; + + // determine the row stride of each matrix + int32_t ldI = compute_num * l->INPUT_SAMPLES; + + // determine the batch offset for each core + int32_t batch_offset = l->SEQ_LEN * l->INPUT_SAMPLES; + + // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); + softmax_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, batch_offset, + l->BATCH_SIZE, l->SEQ_LEN / 8, l->INPUT_SAMPLES); + + } else { + snrt_cluster_hw_barrier(); + } + + snrt_global_barrier(); +} \ No newline at end of file diff --git a/target/sim/sw/host/Makefile b/target/sim/sw/host/Makefile index 21728704..b4b177db 100644 --- a/target/sim/sw/host/Makefile +++ b/target/sim/sw/host/Makefile @@ -7,6 +7,7 @@ # Add user applications to APPS variable APPS = hello_world APPS += offload +APPS += offload_general TARGET ?= all diff --git a/target/sim/sw/host/apps/common_general.mk b/target/sim/sw/host/apps/common_general.mk index cbd7136a..a500bbe2 100644 --- a/target/sim/sw/host/apps/common_general.mk +++ b/target/sim/sw/host/apps/common_general.mk @@ -4,6 +4,10 @@ # # Luca Colagrande +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + ###################### # Invocation options # ###################### @@ -21,11 +25,16 @@ RISCV_OBJDUMP = riscv64-unknown-elf-objdump RISCV_READELF = riscv64-unknown-elf-readelf # Directories -BUILDDIR = $(abspath build) -HOST_DIR = $(abspath ../../) +BUILDDIRS = $(foreach LIB, $(LIBS), $(abspath $(MK_DIR)/$(APP)/build/$(LIB))) +BUILDDIR = $(abspath $(MK_DIR)/$(APP)/build) +HOST_DIR = $(abspath $(MK_DIR)/../) RUNTIME_DIR = $(abspath $(HOST_DIR)/runtime) DEVICE_DIR = $(abspath $(HOST_DIR)/../device) +# Library names +LIBS = sndnn +LIBS += blas + # Dependencies INCDIRS += $(RUNTIME_DIR) INCDIRS += $(HOST_DIR)/../shared/platform/generated @@ -61,17 +70,19 @@ RISCV_LDFLAGS += -T$(LINKER_SCRIPT) # Device binary DEVICE_BUILDDIR = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build) -DEVICE_BINARY = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build/$(DEVICE_APP).bin) +DEVICE_BINARY = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build/$(basename $(notdir $(DEVICE_APP))).bin) ORIGIN_LD = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build/origin.ld) 
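+# Note: DEVICE_APPS entries may be paths into library subdirectories
+# (e.g. sndnn/gemm); the device binary is named after the leaf app only,
+# hence the $(basename $(notdir ...)) in DEVICE_BINARY above.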
########### # Outputs # ########### -PARTIAL_ELF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(APP).part.elf)) +PARTIAL_ELF = $(abspath $(BUILDDIR)/$(APP).part.elf) +# ELF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(basename $(notdir $(DEVICE_APP))).elf)) +# DEP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(basename $(notdir $(DEVICE_APP))).d)) ELF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).elf)) DEP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).d)) -PARTIAL_DUMP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(APP).part.dump)) +PARTIAL_DUMP = $(abspath $(BUILDDIR)/$(APP).part.dump) DUMP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).dump)) DWARF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).dwarf)) PARTIAL_OUTPUTS = $(PARTIAL_ELF) $(PARTIAL_DUMP) $(ORIGIN_LD) @@ -89,24 +100,24 @@ finalize-build: $(FINAL_OUTPUTS) .PHONY: clean clean: - rm -rf $(BUILDDIR) + rm -rf $(BUILDDIRS) rm -f $(OFFSET_LD) -$(BUILDDIR): +$(BUILDDIRS): mkdir -p $@ $(DEVICE_BUILDDIR): mkdir -p $@ -$(DEP): $(SRCS) | $(BUILDDIR) +$(DEP): $(SRCS) | $(BUILDDIRS) $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(PARTIAL_ELF)' $< > $@ $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< >> $@ # Partially linked object -$(PARTIAL_ELF): $(DEP) $(LD_SRCS) | $(BUILDDIR) +$(PARTIAL_ELF): $(DEP) $(LD_SRCS) | $(BUILDDIRS) $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ -$(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIR) +$(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIRS) $(RISCV_OBJDUMP) -D $< > $@ # Device object relocation address @@ -115,14 +126,17 @@ $(ORIGIN_LD): $(PARTIAL_ELF) | $(DEVICE_BUILDDIR) echo "Writing device object relocation address 0x$$RELOC_ADDR to $@"; \ echo "L3_ORIGIN = 0x$$RELOC_ADDR;" > $@ -$(ELF): $(DEP) $(LD_SRCS) $(DEVICE_BINARY) | $(BUILDDIR) - $(eval FINAL_CFLAGS := -DDEVICEBIN=\"$(DEVICE_DIR)/apps/$(CUR_APP_NAME)/build/$(CUR_APP_NAME).bin\") +$(ELF): $(DEP) $(LD_SRCS) $(DEVICE_BINARY) | $(BUILDDIRS) + $(eval CUR_APP_NAME:=$(basename $(notdir $@))) + $(eval CUR_DIR_NAME:=$(basename $(basename $(dir $@)))) + $(eval CUR_LIB=$(strip $(foreach LIB,$(LIBS),$(findstring $(LIB),$(CUR_DIR_NAME))))) + $(eval FINAL_CFLAGS := -DDEVICEBIN=\"$(DEVICE_DIR)/apps/$(CUR_LIB)/$(CUR_APP_NAME)/build/$(CUR_APP_NAME).bin\") $(RISCV_CC) $(RISCV_CFLAGS) $(FINAL_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ -$(DUMP): $(ELF) | $(BUILDDIR) +$(DUMP): $(ELF) | $(BUILDDIRS) $(RISCV_OBJDUMP) -D $< > $@ -$(DWARF): $(ELF) | $(BUILDDIR) +$(DWARF): $(ELF) | $(BUILDDIRS) $(RISCV_READELF) --debug-dump $< > $@ ifneq ($(MAKECMDGOALS),clean) diff --git a/target/sim/sw/host/apps/offload_general/Makefile b/target/sim/sw/host/apps/offload_general/Makefile new file mode 100644 index 00000000..67164604 --- /dev/null +++ b/target/sim/sw/host/apps/offload_general/Makefile @@ -0,0 +1,16 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP = offload_general +SRCS = src/offload_general.c +# INCL_DEVICE_BINARY = true + +DEVICE_APPS = sndnn/gemm +# DEVICE_APPS = gemm +DEVICE_APPS += axpy +# DEVICE_APPS += blas/gemm + +include ../common_general.mk diff --git a/target/sim/sw/host/apps/offload_general/layout.csv b/target/sim/sw/host/apps/offload_general/layout.csv new file mode 100644 index 00000000..c3e3cedd --- /dev/null +++ b/target/sim/sw/host/apps/offload_general/layout.csv @@ -0,0 +1,4 @@ +,prepare data,send interrupt,clr interrupt,get local job ptr,barrier,copy job in,copy data in,get args,barrier,compute,barrier,copy output,send interrupt,clr interrupt +0,1,2,,,,,,,,,,,,4 +"range(1,9)",,,1,2,3,,,4,5,6,7,,, +9,,,1,2,,3,4,,5,,6,7,8, diff --git a/target/sim/sw/host/apps/offload_general/src/offload_general.c b/target/sim/sw/host/apps/offload_general/src/offload_general.c new file mode 100644 index 00000000..0405426a --- /dev/null +++ b/target/sim/sw/host/apps/offload_general/src/offload_general.c @@ -0,0 +1,49 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// #include "offload_general.h" + +#include "host.c" + +// Other variables +// __thread volatile comm_buffer_t* comm_buffer; + +#define N_JOBS 1 + +int main() { + // Reset and ungate quadrant 0, deisolate + reset_and_ungate_quad(0); + deisolate_quad(0, ISO_MASK_ALL); + + // Enable interrupts to receive notice of job termination + enable_sw_interrupts(); + + // Program Snitch entry point and communication buffer + program_snitches(); + + // Wakeup Snitches for snRuntime initialization + wakeup_snitches_cl(); + + int32_t snitch_return_value = -1; + + // Wait for snRuntime initialization to be over + wait_snitches_done(); + + // Send jobs + // for (int i = 0; i < N_JOBS; i++) { + // Start Snitches + // mcycle(); + wakeup_snitches_cl(); + + // Wait for job done + wait_sw_interrupt(); + // Clear interrupt + clear_sw_interrupt(0); + // wait_snitches_done(); + snitch_return_value = ((int32_t)comm_buffer.usr_data_ptr); + // } + // Exit routine + // mcycle(); + return snitch_return_value; +}
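+
+// Note: the host forwards, as its own exit code, the value it reads from the
+// shared communication buffer (comm_buffer.usr_data_ptr); the device-side job
+// is expected to have deposited its return value there before raising the
+// software interrupt that ends the run.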