Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rtl cleanups and optimizations #843

Merged
merged 9 commits into from
Mar 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12

| Date | Version | Comment | Link |
|:----:|:-------:|:--------|:----:|
| 10.03.2024 | 1.9.6.2 | minor rtl clean-ups, optimizations and fixes | [#843](https://github.com/stnolting/neorv32/pull/843) |
| 09.03.2024 | 1.9.6.1 | add generic cache module (not used yet) | [#842](https://github.com/stnolting/neorv32/pull/842) |
| 01.03.2024 | [**:rocket:1.9.6**](https://github.com/stnolting/neorv32/releases/tag/v1.9.6) | **New release** | |
| 25.02.2024 | 1.9.5.10 | :bug: fix minor GPTMR threshold configuration issue | [#834](https://github.com/stnolting/neorv32/pull/834) |
Expand Down
200 changes: 112 additions & 88 deletions rtl/core/neorv32_cache.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ begin

-- Check if Direct/Uncached Access --------------------------------------------------------
-- -------------------------------------------------------------------------------------------
dir_acc_d <= '1' when (host_req_i.addr(31 downto 28) = UC_BEGIN) or -- uncached memory page
(host_req_i.rvso = '1') else '0'; -- atomic )reservation set) operation
dir_acc_d <= '1' when (host_req_i.addr(31 downto 28) >= UC_BEGIN) or -- uncached memory page
(host_req_i.rvso = '1') else '0'; -- atomic (reservation set) operation

-- request splitter: cached or direct access --
req_splitter: process(host_req_i, dir_acc_d)
Expand Down Expand Up @@ -294,7 +294,7 @@ begin
cache_in <= cache_in_host when (cmd_busy = '0') else cache_in_bus;


-- Bus Access Arbiter (Handle Cache Miss and Flush/Reload)---------------------------------
-- Bus Access Arbiter (Handle Cache Miss and Flush/Reload) --------------------------------
-- -------------------------------------------------------------------------------------------
neorv32_cache_bus_inst: neorv32_cache_bus
generic map (
Expand Down Expand Up @@ -328,10 +328,26 @@ begin
rdata_i => cache_out.rdata -- read data
);

-- simple bus multiplexer (as there won't be simultaneous access requests) --
bus_req_o <= bus_req when (cmd_busy = '1') else dir_req_q;
dir_rsp_d <= bus_rsp_i;
bus_rsp <= bus_rsp_i;

-- Bus Access Switch ----------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Use a real switch here to buffer direct access requests
-- during out-of-band cache operation (fence / cache flush).
neorv32_cache_bus_switch: entity neorv32.neorv32_bus_switch
generic map (
PORT_A_READ_ONLY => false,
PORT_B_READ_ONLY => false
)
port map (
clk_i => clk_i,
rstn_i => rstn_i,
a_req_i => bus_req,
a_rsp_o => bus_rsp,
b_req_i => dir_req_q,
b_rsp_o => dir_rsp_d,
x_req_o => bus_req_o,
x_rsp_i => bus_rsp_i
);


end neorv32_cache_rtl;
Expand Down Expand Up @@ -810,58 +826,71 @@ end neorv32_cache_bus;
architecture neorv32_cache_bus_rtl of neorv32_cache_bus is

-- cache layout --
constant offset_size_c : natural := index_size_f(BLOCK_SIZE);
constant offset_size_c : natural := index_size_f(BLOCK_SIZE/4); -- WORD offset!
constant index_size_c : natural := index_size_f(NUM_BLOCKS);
constant tag_lsb_c : natural := index_size_c + offset_size_c;
constant tag_size_c : natural := 32 - (offset_size_c + index_size_c + 2);

-- host request buffer --
signal hreq : bus_req_t;

-- control engine --
type ctrl_state_t is (S_IDLE, S_CHECK_PRE, S_CHECK, S_DOWNLOAD_REQ, S_DOWNLOAD_RSP,
S_UPLOAD_GET, S_UPLOAD_REQ, S_UPLOAD_RSP, S_FLUSH_0, S_FLUSH_1, S_FLUSH_2);
type ctrl_t is record
state, state_nxt : ctrl_state_t; -- FSM state
upret, upret_nxt : ctrl_state_t; -- upload-done return state
addr, addr_nxt : std_ulogic_vector(31 downto 0); -- address generator
bcnt, bcnt_nxt : std_ulogic_vector(index_size_c-1 downto 0); -- block counter
-- control fsm --
type state_t is (S_IDLE, S_CHECK, S_DOWNLOAD_REQ, S_DOWNLOAD_RSP, S_UPLOAD_GET, S_UPLOAD_REQ, S_UPLOAD_RSP, S_FLUSH_START, S_FLUSH_READ, S_FLUSH_CHECK);
signal state, upret, state_nxt, upret_nxt: state_t;

-- address generator --
type addr_t is record
tag : std_ulogic_vector(tag_size_c-1 downto 0);
ind : std_ulogic_vector(index_size_c-1 downto 0);
off : std_ulogic_vector(offset_size_c-1 downto 0); -- WORD offset!
end record;
signal ctrl : ctrl_t;
signal haddr, baddr, addr, addr_nxt : addr_t;

begin

-- Address Decomposition ------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- base address of original host access --
haddr.tag <= host_req_i.addr(31 downto (32-tag_size_c));
haddr.ind <= host_req_i.addr((offset_size_c+2+index_size_c)-1 downto offset_size_c+2);
haddr.off <= (others => '0'); -- unused

-- base address of indexed cache block --
baddr.tag <= base_i(31 downto (32-tag_size_c));
baddr.ind <= base_i((offset_size_c+2+index_size_c)-1 downto offset_size_c+2);
baddr.off <= (others => '0'); -- unused


-- Control Engine FSM Sync ----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
ctrl_engine_sync: process(rstn_i, clk_i)
begin
if (rstn_i = '0') then
ctrl.state <= S_IDLE;
ctrl.upret <= S_IDLE;
ctrl.addr <= (others => '0');
ctrl.bcnt <= (others => '0');
hreq <= req_terminate_c;
state <= S_IDLE;
upret <= S_IDLE;
addr.tag <= (others => '0');
addr.ind <= (others => '0');
addr.off <= (others => '0');
hreq <= req_terminate_c;
elsif rising_edge(clk_i) then
ctrl.state <= ctrl.state_nxt;
ctrl.upret <= ctrl.upret_nxt;
ctrl.addr <= ctrl.addr_nxt;
ctrl.bcnt <= ctrl.bcnt_nxt;
hreq <= host_req_i;
state <= state_nxt;
upret <= upret_nxt;
addr <= addr_nxt;
hreq <= host_req_i;
end if;
end process ctrl_engine_sync;


-- Control Engine FSM Comb ----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
ctrl_engine_comb: process(ctrl, hreq, host_req_i, bus_rsp_i, cmd_sync_i, cmd_miss_i, rdata_i, dirty_i, base_i)
ctrl_engine_comb: process(state, upret, addr, hreq, haddr, baddr, bus_rsp_i, cmd_sync_i, cmd_miss_i, rdata_i, dirty_i)
begin
-- control defaults --
ctrl.state_nxt <= ctrl.state;
ctrl.upret_nxt <= ctrl.upret;
ctrl.addr_nxt <= ctrl.addr;
ctrl.bcnt_nxt <= ctrl.bcnt;
-- control engine defaults --
state_nxt <= state;
upret_nxt <= upret;
addr_nxt <= addr;

-- cache defaults --
addr_o <= ctrl.addr;
addr_o <= addr.tag & addr.ind & addr.off & "00"; -- always word-aligned
we_o <= (others => '0');
swe_o <= '0';
wdata_o <= bus_rsp_i.data;
Expand All @@ -873,46 +902,42 @@ begin

-- bus interface defaults --
bus_req_o <= req_terminate_c; -- all-zero
bus_req_o.addr <= ctrl.addr(31 downto 2) & "00"; -- always word-aligned
bus_req_o.addr <= addr.tag & addr.ind & addr.off & "00"; -- always word-aligned
bus_req_o.data <= rdata_i;
bus_req_o.ben <= (others => '1'); -- full-word writes only
bus_req_o.priv <= hreq.priv; -- keep original privilege level

-- fsm --
case ctrl.state is
case state is

when S_IDLE => -- wait for request
-- ------------------------------------------------------------
ctrl.addr_nxt(offset_size_c-1 downto 0) <= (others => '0'); -- align block base address
ctrl.bcnt_nxt <= (others => '0'); -- reset block counter
addr_nxt.off <= (others => '0'); -- align block base address for upload/download (and flush)
if (cmd_sync_i = '1') then -- cache sync
ctrl.state_nxt <= S_FLUSH_0;
state_nxt <= S_FLUSH_START;
elsif (cmd_miss_i = '1') then -- cache miss
ctrl.addr_nxt(31 downto offset_size_c) <= host_req_i.addr(31 downto offset_size_c); -- buffer original tag + index for cache look-up
ctrl.state_nxt <= S_CHECK_PRE;
state_nxt <= S_CHECK;
end if;

when S_CHECK_PRE => -- cache memory access latency
-- ------------------------------------------------------------
ctrl.state_nxt <= S_CHECK;

when S_CHECK => -- check if accessed block is dirty
when S_CHECK => -- check if accessed block is dirty (cache address is still applied by host controller!)
-- ------------------------------------------------------------
ctrl.upret_nxt <= S_DOWNLOAD_REQ; -- go straight to S_DOWNLOAD_REQ after S_UPLOAD_GET is completed (if executed)
upret_nxt <= S_DOWNLOAD_REQ; -- go straight to S_DOWNLOAD_REQ when S_UPLOAD_GET has completed (if executed)
if (dirty_i = '1') then -- block is dirty, upload first
ctrl.addr_nxt(31 downto offset_size_c) <= base_i(31 downto offset_size_c); -- base address of accessed block
ctrl.state_nxt <= S_UPLOAD_GET;
addr_nxt.tag <= baddr.tag; -- base address (tag + index) of accessed block
addr_nxt.ind <= baddr.ind;
state_nxt <= S_UPLOAD_GET;
else -- block is clean, download new block and override
ctrl.addr_nxt(31 downto offset_size_c) <= host_req_i.addr(31 downto offset_size_c); -- base address of requested block
ctrl.state_nxt <= S_DOWNLOAD_REQ;
addr_nxt.tag <= haddr.tag; -- base address (tag + index) of requested block
addr_nxt.ind <= haddr.ind;
state_nxt <= S_DOWNLOAD_REQ;
end if;


when S_DOWNLOAD_REQ => -- download new cache block: request new word
-- ------------------------------------------------------------
bus_req_o.rw <= '0'; -- read access
bus_req_o.stb <= '1'; -- request new transfer
ctrl.state_nxt <= S_DOWNLOAD_RSP;
bus_req_o.rw <= '0'; -- read access
bus_req_o.stb <= '1'; -- request new transfer
state_nxt <= S_DOWNLOAD_RSP;

when S_DOWNLOAD_RSP => -- download new cache block: wait for bus response
-- ------------------------------------------------------------
Expand All @@ -921,77 +946,76 @@ begin
swe_o <= '1'; -- cache: write status bit (bus error response)
new_o <= '1'; -- set new block (set tag, make valid, make clean)
if (bus_rsp_i.ack = '1') or (bus_rsp_i.err = '1') then -- wait for response
ctrl.addr_nxt(offset_size_c-1 downto 2) <= std_ulogic_vector(unsigned(ctrl.addr(offset_size_c-1 downto 2)) + 1);
if (and_reduce_f(ctrl.addr(offset_size_c-1 downto 2)) = '1') then -- block completed? offset will be all-zero again after block completion
ctrl.state_nxt <= S_IDLE;
addr_nxt.off <= std_ulogic_vector(unsigned(addr.off) + 1);
if (and_reduce_f(addr.off) = '1') then -- block completed? offset will be all-zero again after block completion
state_nxt <= S_IDLE;
else -- get next word
ctrl.state_nxt <= S_DOWNLOAD_REQ;
state_nxt <= S_DOWNLOAD_REQ;
end if;
end if;


when S_UPLOAD_GET => -- upload dirty cache block: read word from cache
-- ------------------------------------------------------------
bus_req_o.rw <= '1'; -- write access
ctrl.state_nxt <= S_UPLOAD_REQ;
bus_req_o.rw <= '1'; -- write access
state_nxt <= S_UPLOAD_REQ;

when S_UPLOAD_REQ => -- upload dirty cache block: request bus write
-- ------------------------------------------------------------
bus_req_o.rw <= '1'; -- write access
bus_req_o.stb <= '1'; -- request new transfer
ctrl.state_nxt <= S_UPLOAD_RSP;
bus_req_o.rw <= '1'; -- write access
bus_req_o.stb <= '1'; -- request new transfer
state_nxt <= S_UPLOAD_RSP;

when S_UPLOAD_RSP => -- upload dirty cache block: wait for bus response
-- ------------------------------------------------------------
bus_req_o.rw <= '1'; -- write access
new_o <= '1'; -- set new block (set tag, make valid, make clean)
if (bus_rsp_i.ack = '1') or (bus_rsp_i.err = '1') then -- wait for response
ctrl.addr_nxt(offset_size_c-1 downto 2) <= std_ulogic_vector(unsigned(ctrl.addr(offset_size_c-1 downto 2)) + 1);
if (and_reduce_f(ctrl.addr(offset_size_c-1 downto 2)) = '1') then -- block completed? offset will be all-zero again after block completion
ctrl.state_nxt <= ctrl.upret; -- go back to "upload-done return state"
addr_nxt.off <= std_ulogic_vector(unsigned(addr.off) + 1);
if (and_reduce_f(addr.off) = '1') then -- block completed? offset will be all-zero again after block completion
state_nxt <= upret; -- go back to "upload-done return state"
else -- get next word
ctrl.state_nxt <= S_UPLOAD_GET;
state_nxt <= S_UPLOAD_GET;
end if;
end if;


when S_FLUSH_0 => -- cache access latency cycle
when S_FLUSH_START => -- start checking for dirty blocks
-- ------------------------------------------------------------
ctrl.addr_nxt(tag_lsb_c-1 downto offset_size_c) <= ctrl.bcnt; -- current block to check if dirty
ctrl.state_nxt <= S_FLUSH_1;
addr_nxt.ind <= (others => '0'); -- start with index 0
upret_nxt <= S_FLUSH_CHECK; -- come back to S_FLUSH_CHECK after block upload
state_nxt <= S_FLUSH_READ;

when S_FLUSH_1 => -- sync. cache memory read latency cycle
when S_FLUSH_READ => -- cache read access latency cycle
-- ------------------------------------------------------------
ctrl.state_nxt <= S_FLUSH_2;
state_nxt <= S_FLUSH_CHECK;

when S_FLUSH_2 => -- check if currently indexed block is dirty
when S_FLUSH_CHECK => -- check if currently indexed block is dirty
-- ------------------------------------------------------------
ctrl.upret_nxt <= S_FLUSH_2; -- come back here after upload
inval_o <= '1'; -- invalidate currently checked block
ctrl.addr_nxt(31 downto offset_size_c) <= base_i(31 downto offset_size_c); -- tag + index of currently checked block
-- check if dirty / upload required --
if (dirty_i = '1') then -- upload dirty block to main memory
ctrl.state_nxt <= S_UPLOAD_GET;
addr_nxt.tag <= baddr.tag; -- tag of currently index block
inval_o <= '1'; -- invalidate currently index block
if (dirty_i = '1') then -- block dirty?
state_nxt <= S_UPLOAD_GET;
else -- move on to next block
ctrl.bcnt_nxt <= std_ulogic_vector(unsigned(ctrl.bcnt) + 1);
if (and_reduce_f(ctrl.bcnt) = '1') then -- all blocks done?
bus_req_o.fence <= '1'; -- forward fence (sync) to downstream memories
ctrl.state_nxt <= S_IDLE;
addr_nxt.ind <= std_ulogic_vector(unsigned(addr.ind) + 1);
if (and_reduce_f(addr.ind) = '1') then -- all blocks done?
bus_req_o.fence <= '1'; -- forward fence request to downstream memories
state_nxt <= S_IDLE;
else -- go to next block
ctrl.state_nxt <= S_FLUSH_0;
state_nxt <= S_FLUSH_READ;
end if;
end if;


when others => -- undefined
-- ------------------------------------------------------------
ctrl.state_nxt <= S_IDLE;
state_nxt <= S_IDLE;

end case;
end process ctrl_engine_comb;

-- bus arbiter operation in progress --
cmd_busy_o <= '0' when (ctrl.state = S_IDLE) else '1';
-- bus arbiter operation in progress (host keeps allying cache address while bud unit reports idle state) --
cmd_busy_o <= '0' when (state = S_IDLE) or (state = S_CHECK) else '1';


end neorv32_cache_bus_rtl;
12 changes: 6 additions & 6 deletions rtl/core/neorv32_cpu_control.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,9 @@ begin

when IF_REQUEST => -- request next 32-bit-aligned instruction word
-- ------------------------------------------------------------
if (ipb.free = "11") then -- wait for free IPB space
if (ipb.free = "11") then -- free IPB space?
fetch_engine.state <= IF_PENDING;
elsif (fetch_engine.restart = '1') or (fetch_engine.reset = '1') then -- restart request due to branch
elsif (fetch_engine.restart = '1') or (fetch_engine.reset = '1') then -- restart because of branch
fetch_engine.state <= IF_RESTART;
end if;

Expand Down Expand Up @@ -849,14 +849,14 @@ begin
-- state machine --
case execute_engine.state is

when DISPATCH => -- Wait for ISSUE ENGINE to emit valid instruction word
when DISPATCH => -- Wait for ISSUE ENGINE to emit a valid instruction word
-- ------------------------------------------------------------
if (trap_ctrl.env_pending = '1') or (trap_ctrl.exc_fire = '1') then -- pending trap or pending exception (fast)
execute_engine.state_nxt <= TRAP_ENTER;
elsif (CPU_EXTENSION_RISCV_Sdtrig = true) and (hw_trigger_match = '1') then -- hardware breakpoint
execute_engine.pc_we <= '1'; -- pc <= next_pc
trap_ctrl.hwtrig <= '1';
execute_engine.state_nxt <= TRAP_ENTER;
execute_engine.state_nxt <= DISPATCH; -- stay here another round until trap_ctrl.hwtrig arrives in trap_ctrl.env_pending
elsif (issue_engine.valid(0) = '1') or (issue_engine.valid(1) = '1') then -- new instruction word available
issue_engine.ack <= '1';
trap_ctrl.instr_be <= issue_engine.data(32); -- access fault during instruction fetch
Expand Down Expand Up @@ -1413,7 +1413,7 @@ begin
-- break point --
if (CPU_EXTENSION_RISCV_Sdext = true) then
trap_ctrl.exc_buf(exc_ebreak_c) <= (not trap_ctrl.env_enter) and (trap_ctrl.exc_buf(exc_ebreak_c) or
(trap_ctrl.hwtrig and (not csr.tdata1_action)) or -- trigger module fires and enter-debug is disabled
(trap_ctrl.hwtrig and (not csr.tdata1_action)) or -- trigger module fires and enter-debug-action is disabled
(trap_ctrl.ebreak and ( csr.privilege) and (not csr.dcsr_ebreakm) and (not debug_ctrl.running)) or -- enter M-mode handler on ebreak in M-mode
(trap_ctrl.ebreak and (not csr.privilege) and (not csr.dcsr_ebreaku) and (not debug_ctrl.running))); -- enter M-mode handler on ebreak in U-mode
else
Expand Down Expand Up @@ -1550,7 +1550,7 @@ begin
trap_ctrl.env_pending <= '0';
elsif rising_edge(clk_i) then
if (trap_ctrl.env_pending = '0') then -- no pending trap environment yet
-- trigger IRQ only in EXECUTE states to *continue execution* even if there are permanent interrupt requests
-- trigger IRQ only in EXECUTE state --
if (trap_ctrl.exc_fire = '1') or ((trap_ctrl.irq_fire = '1') and (execute_engine.state = EXECUTE)) then
trap_ctrl.env_pending <= '1'; -- now execute engine can start trap handling
end if;
Expand Down
6 changes: 0 additions & 6 deletions rtl/core/neorv32_cpu_cp_fpu.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -287,12 +287,6 @@ architecture neorv32_cpu_cp_fpu_rtl of neorv32_cpu_cp_fpu is

begin

-- Sanity Checks --------------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
assert false report
"[NEORV32] The floating-point unit (Zfinx ISA extension) is still in experimental state." severity warning;


-- ****************************************************************************************************************************
-- Control
-- ****************************************************************************************************************************
Expand Down