Skip to content

Commit

Permalink
Merge pull request #275 from riscv-boom/dcache3
Browse files Browse the repository at this point in the history
[lsu] LSU Refactor w. general improvements
  • Loading branch information
jerryz123 committed Aug 26, 2019
2 parents 781b68a + 64eea09 commit ac28f02
Show file tree
Hide file tree
Showing 31 changed files with 3,713 additions and 2,115 deletions.
42 changes: 21 additions & 21 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- run:
name: Building riscv-tools
command: |
.circleci/build-riscv-tools.sh
no_output_timeout: 120m
- save_cache:
key: riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
key: riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
paths:
- "/home/riscvuser/riscv-tools-install"
prepare-build-environment:
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- chipyard-v12-{{ checksum "CHIPYARD.hash" }}
Expand Down Expand Up @@ -104,7 +104,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- chipyard-v12-{{ checksum "CHIPYARD.hash" }}
Expand Down Expand Up @@ -132,7 +132,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- chipyard-v12-{{ checksum "CHIPYARD.hash" }}
Expand Down Expand Up @@ -160,7 +160,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- chipyard-v12-{{ checksum "CHIPYARD.hash" }}
Expand All @@ -170,7 +170,7 @@ jobs:
- run:
name: Building MegaBoomConfig using Verilator
command: .circleci/do-rtl-build.sh megaboom
no_output_timeout: 120m
no_output_timeout: 240m
- save_cache:
key: megaboomconfig-{{ .Branch }}-{{ .Revision }}
paths:
Expand All @@ -188,7 +188,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- chipyard-v12-{{ checksum "CHIPYARD.hash" }}
Expand Down Expand Up @@ -216,7 +216,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- chipyard-v12-{{ checksum "CHIPYARD.hash" }}
Expand All @@ -241,7 +241,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- smallboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -262,7 +262,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- smallboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -282,7 +282,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- mediumboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -303,7 +303,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- mediumboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -323,7 +323,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- largeboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -344,7 +344,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- largeboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -364,7 +364,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- megaboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -385,7 +385,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- megaboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -406,7 +406,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- smallboomandrocketconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -427,7 +427,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- smallboomandrocketconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -447,7 +447,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- smallrv32unifiedboomconfig-{{ .Branch }}-{{ .Revision }}
Expand All @@ -468,7 +468,7 @@ jobs:
- checkout
- restore_cache:
keys:
- riscv-tools-installed-v3-{{ checksum "CHIPYARD.hash" }}
- riscv-tools-installed-v4-{{ checksum "CHIPYARD.hash" }}
- restore_cache:
keys:
- smallrv32unifiedboomconfig-{{ .Branch }}-{{ .Revision }}
Expand Down
3 changes: 3 additions & 0 deletions .circleci/do-rtl-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ set -ex
SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
source $SCRIPT_DIR/defaults.sh

rm -rf $LOCAL_CHIPYARD_DIR/generators/boom/*
mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHIPYARD_DIR/generators/boom/

# call clean on exit
trap clean EXIT

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ lib
*#
*~
project/build.properties
.nfs*
2 changes: 1 addition & 1 deletion CHIPYARD.hash
Original file line number Diff line number Diff line change
@@ -1 +1 @@
89b312a8891a4df855d82d4accdfb0db20613d65
9844fcf43bcfc3590891971934ebdfcf29bad00d
80 changes: 35 additions & 45 deletions docs/sections/load-store-unit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,35 @@ The Load/Store Unit (LSU)
The Load/Store Unit

The Load/Store Unit is responsible for deciding when to fire memory
operations to the memory system. There are three queues: the Load
Address Queue (LAQ), the Store Address Queue (SAQ), and the Store Data
Queue (SDQ). Load instructions generate a “uopLD" Micro-Op. When issued,
“uopLD" calculates the load address and places its result in the LAQ.
Store instructions (may) generate *two* Micro-Ops, “uopSTA" (Store
Address Generation) and “uopSTD" (Store Data Generation). The STA
Micro-Op calculates the store address and places its result in the SAQ
queue. The STD Micro-Op moves the store data from the register file to
the SDQ. Each of these Micro-Ops will issue out of the *Issue Window* as
soon as their operands are ready. See :ref:`Store Micro-Ops` for more
details on the store Micro-Op specifics.
operations to the memory system. There are two queues: the Load
Queue (LDQ) and the Store Queue (STQ). Load instructions generate a
"uopLD" Micro-Op. When issued, "uopLD" calculates the load address and
places its result in the LDQ. Store instructions (may) generate *two*
Micro-Ops, "uopSTA" (Store Address Generation) and "uopSTD" (Store Data
Generation). The STA Micro-Op calculates the store address and places its
result in the address field of the store's STQ entry. The STD Micro-Op moves
the store data from the register file to the data field of the same STQ
entry. Each of these Micro-Ops will issue out of the *Issue Window* as soon
as their operands are ready. See :ref:`Store Micro-Ops` for more details on
the store Micro-Op specifics.

Store Instructions
------------------

Entries in the Store Queue [1]_ are allocated in the *Decode* stage (the
appropriate bit in the stq\_entry\_val vector is set). A “valid" bit
denotes when an entry in the SAQ or SDQ holds a valid address or data
(saq\_val and sdq\_val respectively). Once a store instruction is
committed, the corresponding entry in the Store Queue is marked as
committed. The store is then free to be fired to the memory system at
its convenience. Stores are fired to the memory in program order.
Entries in the Store Queue are allocated in the *Decode* stage
(stq(i).valid is set). A "valid" bit denotes when an STQ entry holds a
valid address or valid data (stq(i).bits.addr.valid and
stq(i).bits.data.valid respectively).
Once a store instruction is committed, the corresponding entry in the Store
Queue is marked as committed. The store is then free to be fired to the
memory system at its convenience. Stores are fired to the memory in program
order.

Store Micro-Ops
~~~~~~~~~~~~~~~

Stores are inserted into the issue window as a single instruction (as
opposed to being broken up into separate addr-gen and data-gen
Micro-Ops). This prevents wasteful usage of the expensive issue window
entries and extra contention on the issue port to the LSU. A store in
entries and extra contention on the issue ports to the LSU. A store in
which both operands are ready can be issued to the LSU as a single
Micro-Op which provides both the address and the data to the LSU. While
this requires store instructions to have access to two register file
Expand All @@ -55,14 +54,14 @@ ready.
Load Instructions
-----------------

Entries in the Load Queue (LAQ) are allocated in the *Decode* stage
(laq\_entry\_val). In *Decode*, each load entry is also given a *store
mask* (laq\_st\_mask), which marks which stores in the Store Queue the
given load depends on. When a store is fired to memory and leaves the
Store Queue, the appropriate bit in the *store mask* is cleared.
Entries in the Load Queue (LDQ) are allocated in the *Decode* stage
(ldq(i).valid). In *Decode*, each load entry is also given a *store
mask* (ldq(i).bits.st\_dep\_mask), which marks which stores in the Store
Queue the given load depends on. When a store is fired to memory and
leaves the Store Queue, the appropriate bit in the *store mask* is cleared.

Once a load address has been computed and placed in the LAQ, the
corresponding *valid* bit is set (laq\_val).
Once a load address has been computed and placed in the LDQ, the
corresponding *valid* bit is set (ldq(i).bits.addr.valid).

Loads are optimistically fired to memory on arrival to the LSU (getting
loads fired early is a huge benefit of out-of-order pipelines).
Expand All @@ -72,35 +71,30 @@ request is killed. If the corresponding store data is present, then the
store data is *forwarded* to the load and the load marks itself as
having *succeeded*. If the store data is not present, then the load goes
to *sleep*. Loads that have been put to sleep are retried at a later
time. [2]_
time. [1]_

The BOOM Memory Model
---------------------

Currently, as of October 2016, the RISC-V memory model is underspecified
and will take some time to settle on an exact specification. However,
the current RISC-V specification describes a *relaxed consistency*
model in which
stores and loads may be freely re-ordered.
BOOM follows the RVWMO memory consistency model.

BOOM currently exhibits the following behavior:

#. Write -> Read constraint is relaxed (newer loads may execute before
older stores).

#. Read -> Read constraint is currently relaxed (loads to the same address
may be reordered).
#. Read -> Read constraint is maintained (loads to the same address
appear in order).

#. A thread can read its own writes early.

Ordering Loads to the Same Address
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The RISC-V memory model is expected to strengthen the requirement that
loads to the same address be ordered. [3]_ This requires loads to search
against other loads for potential address conflicts. If a younger load
executes before an older load with a matching address, the younger load
must be replayed and the instructions after it in the pipeline flushed.
The RVWMO memory model requires that loads to the same address be ordered.
[2]_ This requires loads to search against other loads for potential address conflicts.
If a younger load executes before an older load with a matching address, the
younger load must be replayed and the instructions after it in the pipeline flushed.
However, this replay is only required if a cache coherence probe event
snooped the core’s memory, exposing the reordering to the other threads.
If no probe events occurred, the load re-ordering may safely occur.
Expand All @@ -124,22 +118,18 @@ ordering failure, the pipeline must be flushed and the Rename Map Tables
reset. This is an incredibly expensive operation.

To discover ordering failures, when a store commits, it checks the
entire LAQ for any address matches. If there is a match, the store
entire LDQ for any address matches. If there is a match, the store
checks to see if the load has *executed*, and if it got its data from
memory or if the data was forwarded from an older store. In either case,
a memory ordering failure has occurred.

See :numref:`lsu` for more information about the Load/Store Unit.

.. [1]
When I refer to the *Store Queue*, I really mean both the SAQ and
SDQ.
.. [2]
Higher-performance processors will track *why* a load was put to
sleep and wake it up once the blocking cause has been alleviated.
.. [3]
.. [2]
Technically, a *fence.r.r* could be used to provide the correct
execution of software on machines that reorder dependent loads.
However, there are two reasons for an ISA to disallow re-ordering of
Expand Down
1 change: 1 addition & 0 deletions src/main/scala/common/configs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class TracedSmallBoomConfig extends Config(

//RV32IMAC TODO: Support FP
// Small RV32 BOOM configuration: composes the WithBoomRV32, WithoutBoomFPU and
// WithUnifiedMemIntIQs fragments with SmallBoomConfig as the base.
// NOTE(review): presumably fragments listed earlier in the `++` chain override
// later ones, so SmallBoomConfig supplies the defaults — confirm against Config.
class SmallRV32UnifiedBoomConfig extends Config(
new WithBoomRV32 ++
new WithoutBoomFPU ++
new WithUnifiedMemIntIQs ++
new SmallBoomConfig)
Expand Down
7 changes: 4 additions & 3 deletions src/main/scala/common/consts.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ trait BOOMDebugConstants
{
val DEBUG_PRINTF = false // use the Chisel printf functionality
val COMMIT_LOG_PRINTF = false // dump commit state, for comparision against ISA sim
val MEMTRACE_PRINTF = false // dump trace of memory accesses to L1D for debugging
val O3PIPEVIEW_PRINTF = false // dump trace for O3PipeView from gem5
val O3_CYCLE_TIME = (1000)// "cycle" time expected by o3pipeview.py

Expand Down Expand Up @@ -290,8 +291,8 @@ trait ScalarOpConstants
uop.uopc := uopNOP // maybe not required, but helps on asserts that try to catch spurious behavior
uop.bypassable := false.B
uop.fp_val := false.B
uop.is_store := false.B
uop.is_load := false.B
uop.uses_stq := false.B
uop.uses_ldq := false.B
uop.pdst := 0.U
uop.dst_rtype := RT_X
// TODO these unnecessary? used in regread stage?
Expand Down Expand Up @@ -339,7 +340,7 @@ trait RISCVConstants
// memory consistency model
// The C/C++ atomics MCM requires that two loads to the same address maintain program order.
// The Cortex A9 does NOT enforce load/load ordering (which leads to buggy behavior).
val MCM_ORDER_DEPENDENT_LOADS = false
val MCM_ORDER_DEPENDENT_LOADS = true

val jal_opc = (0x6f).U
val jalr_opc = (0x67).U
Expand Down

0 comments on commit ac28f02

Please sign in to comment.