Skip to content

Commit

Permalink
Merge pull request #366 from riscv-boom/dcache3-prefetch
Browse files Browse the repository at this point in the history
DCache prefetching + multi-issue
  • Loading branch information
jerryz123 committed Sep 16, 2019
2 parents cae8808 + 51b39c5 commit f6ffb9c
Show file tree
Hide file tree
Showing 18 changed files with 2,849 additions and 1,268 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ jobs:
- run:
name: Building MegaBoomConfig using Verilator
command: .circleci/do-rtl-build.sh megaboom
no_output_timeout: 120m
no_output_timeout: 240m
- save_cache:
key: megaboomconfig-{{ .Branch }}-{{ .Revision }}
paths:
Expand Down
2 changes: 1 addition & 1 deletion .circleci/do-rtl-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ copy $LOCAL_VERILATOR_DIR/ $SERVER:$REMOTE_VERILATOR_DIR

# enter the verilator directory and build the specific config on remote server
run "make -C $REMOTE_SIM_DIR clean"
run "export RISCV=\"$REMOTE_RISCV_DIR\"; export VERILATOR_ROOT=$REMOTE_VERILATOR_DIR/install/share/verilator; make -j$NPROC -C $REMOTE_SIM_DIR VERILATOR_INSTALL_DIR=$REMOTE_VERILATOR_DIR JAVA_ARGS=\"-Xmx8G -Xss8M\" ${mapping[$1]}"
run "export RISCV=\"$REMOTE_RISCV_DIR\"; export VERILATOR_ROOT=$REMOTE_VERILATOR_DIR/install/share/verilator; make -j$NPROC -C $REMOTE_SIM_DIR VERILATOR_INSTALL_DIR=$REMOTE_VERILATOR_DIR JAVA_ARGS=\"-Xmx16G -Xss8M\" ${mapping[$1]}"
run "rm -rf $REMOTE_CHIPYARD_DIR/project"

# copy back the final build
Expand Down
6 changes: 4 additions & 2 deletions src/main/scala/common/config-mixins.scala
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ class WithMegaBooms extends Config((site, here, up) => {
decodeWidth = 4,
numRobEntries = 128,
issueParams = Seq(
IssueParams(issueWidth=1, numEntries=32, iqType=IQT_MEM.litValue, dispatchWidth=4),
IssueParams(issueWidth=2, numEntries=32, iqType=IQT_MEM.litValue, dispatchWidth=4),
IssueParams(issueWidth=3, numEntries=32, iqType=IQT_INT.litValue, dispatchWidth=4),
IssueParams(issueWidth=2, numEntries=32, iqType=IQT_FP.litValue , dispatchWidth=4)),
numIntPhysRegisters = 128,
Expand All @@ -293,6 +293,8 @@ class WithMegaBooms extends Config((site, here, up) => {
numStqEntries = 32,
maxBrCount = 16,
numFetchBufferEntries = 32,
enablePrefetching=true,
numDCacheBanks=2,
ftq = FtqParameters(nEntries=32),
btb = BoomBTBParameters(btbsa=true, densebtb=false, nSets=512, nWays=4, nRAS=16, tagSz=20),
bpdBaseOnly = None,
Expand All @@ -302,7 +304,7 @@ class WithMegaBooms extends Config((site, here, up) => {
fpu = Some(freechips.rocketchip.tile.FPUParams(sfmaLatency=4, dfmaLatency=4, divSqrt=true))),
dcache = Some(DCacheParams(rowBits = site(SystemBusKey).beatBytes*8,
nSets=64, nWays=8, nMSHRs=8, nTLBEntries=32)),
icache = Some(ICacheParams(fetchBytes = 4*4, rowBits = site(SystemBusKey).beatBytes*8, nSets=64, nWays=8))
icache = Some(ICacheParams(fetchBytes = 4*4, rowBits = site(SystemBusKey).beatBytes*8, nSets=64, nWays=8, prefetch=true))
)}
case SystemBusKey => up(SystemBusKey, site).copy(beatBytes = 16)
case XLen => 64
Expand Down
5 changes: 5 additions & 0 deletions src/main/scala/common/parameters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ case class BoomCoreParams(
nPerfCounters: Int = 0,
numRXQEntries: Int = 4,
numRCQEntries: Int = 8,
numDCacheBanks: Int = 1,
/* more stuff */

useFetchMonitor: Boolean = true,
Expand Down Expand Up @@ -197,6 +198,9 @@ trait HasBoomCoreParameters extends freechips.rocketchip.tile.HasCoreParameters
require (issueParams.count(_.iqType == IQT_MEM.litValue) == 1 || usingUnifiedMemIntIQs)
require (issueParams.count(_.iqType == IQT_INT.litValue) == 1)

// Issue width of the integer issue queue. The lookup cannot come back empty because
// exactly one IQT_INT queue is required above.
val intWidth = issueParams.collectFirst {
  case params if params.iqType == IQT_INT.litValue => params.issueWidth
}.get
// Number of memory issue ports: fixed at 1 when memory and integer uops share a unified
// issue queue, otherwise the issue width of the dedicated IQT_MEM queue.
val memWidth =
  if (!usingUnifiedMemIntIQs) {
    issueParams.collectFirst {
      case params if params.iqType == IQT_MEM.litValue => params.issueWidth
    }.get
  } else {
    1
  }

issueParams.map(x => require(x.dispatchWidth <= coreWidth && x.dispatchWidth > 0))

//************************************
Expand All @@ -209,6 +213,7 @@ trait HasBoomCoreParameters extends freechips.rocketchip.tile.HasCoreParameters

val enableFastLoadUse = boomParams.enableFastLoadUse
val enablePrefetching = boomParams.enablePrefetching
val nLBEntries = dcacheParams.nMSHRs

//************************************
// Branch Prediction
Expand Down
112 changes: 78 additions & 34 deletions src/main/scala/exu/core.scala
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class BoomCore(implicit p: Parameters) extends BoomModule
// ********************************************************
// Clear fp_pipeline before use
if (usingFPU) {
fp_pipeline.io.ll_wport := DontCare
fp_pipeline.io.ll_wports := DontCare
fp_pipeline.io.wb_valids := DontCare
fp_pipeline.io.wb_pdsts := DontCare
}
Expand All @@ -96,7 +96,7 @@ class BoomCore(implicit p: Parameters) extends BoomModule
val numFastWakeupPorts = exe_units.count(_.bypassable)
val numAlwaysBypassable = exe_units.count(_.alwaysBypassable)

val numIntIssueWakeupPorts = numIrfWritePorts + 1 + numFastWakeupPorts - numAlwaysBypassable // + 1 for ll_wb
val numIntIssueWakeupPorts = numIrfWritePorts + memWidth + numFastWakeupPorts - numAlwaysBypassable // + memWidth for ll_wb
// Number of wakeup ports seen by integer rename. With fast wakeups it matches the issue
// wakeup ports. Otherwise rename needs one slow wakeup per IRF write port plus one per
// memory port: int_ren_wakeups(0) comes from ll_wbarb and int_ren_wakeups(1 until memWidth)
// come straight from the extra memory units, so a fixed "+ 1" under-sizes the vector
// whenever memWidth > 1.
val numIntRenameWakeupPorts = if (enableFastWakeupsToRename) numIntIssueWakeupPorts else numIrfWritePorts + memWidth
val numFpWakeupPorts = if (usingFPU) fp_pipeline.io.wakeups.length else 0

Expand All @@ -114,17 +114,20 @@ class BoomCore(implicit p: Parameters) extends BoomModule
Module(new RegisterFileSeqCustomArray(
numIntPhysRegs,
numIrfReadPorts,
numIrfWritePorts + 1, // + 1 for ll writebacks
numIrfWritePorts + memWidth, // + memWidth for ll writebacks
xLen,
Seq(true) ++ exe_units.bypassable_write_port_mask)) // 0th is bypassable ll_wb
Seq.fill(memWidth) {true} ++ exe_units.bypassable_write_port_mask)) // bypassable ll_wb
} else {
Module(new RegisterFileSynthesizable(
numIntPhysRegs,
numIrfReadPorts,
numIrfWritePorts + 1, // + 1 for ll writebacks
numIrfWritePorts + memWidth, // + memWidth for ll writebacks
xLen,
Seq(true) ++ exe_units.bypassable_write_port_mask)) // 0th is bypassable ll_wb
Seq.fill(memWidth) {true} ++ exe_units.bypassable_write_port_mask)) // bypassable ll_wb
}

// wb arbiter for the 0th ll writeback
// TODO: should this be a multi-arb?
val ll_wbarb = Module(new Arbiter(new ExeUnitResp(xLen), 1 +
(if (usingFPU) 1 else 0) +
(if (usingRoCC) 1 else 0)))
Expand All @@ -136,7 +139,7 @@ class BoomCore(implicit p: Parameters) extends BoomModule
exe_units.numTotalBypassPorts,
xLen))
val rob = Module(new Rob(
numIrfWritePorts + 1 + numFpWakeupPorts, // +1 for ll writebacks
numIrfWritePorts + memWidth + numFpWakeupPorts, // +memWidth for ll writebacks
numFpWakeupPorts))
// Used to wakeup registers in rename and issue. ROB needs to listen to something else.
val int_iss_wakeups = Wire(Vec(numIntIssueWakeupPorts, Valid(new ExeUnitResp(xLen))))
Expand Down Expand Up @@ -189,7 +192,11 @@ class BoomCore(implicit p: Parameters) extends BoomModule
}

// Load/Store Unit & ExeUnits
exe_units.memory_unit.io.lsu_io <> io.lsu.exe
// All execution units with a memory port, and the ll_iresp port of each of them.
val mem_units = exe_units.memory_units
val mem_resps = mem_units.map(unit => unit.io.ll_iresp)
// Pair memory unit `port` with LSU execute port `port`.
(0 until memWidth).foreach { port =>
  mem_units(port).io.lsu_io <> io.lsu.exe(port)
}

//-------------------------------------------------------------
// Uarch Hardware Performance Events (HPEs)
Expand Down Expand Up @@ -308,11 +315,17 @@ class BoomCore(implicit p: Parameters) extends BoomModule

// SFence needs access to the PC to inject an address into the TLB's CAM port. The ROB
// will have to later redirect the PC back to the regularly scheduled program.
io.ifu.sfence_take_pc := io.lsu.exe.req.bits.sfence.valid
io.ifu.sfence_addr := io.lsu.exe.req.bits.sfence.bits.addr
// Default: no SFENCE redirect this cycle; the address is left as DontCare unless a port
// below overrides it.
io.ifu.sfence_take_pc := false.B
io.ifu.sfence_addr := DontCare
// Any LSU execute port carrying a valid sfence requests the redirect. These are
// last-connect-wins assignments, so if several ports are valid in the same cycle the
// highest-indexed port supplies the address.
for (i <- 0 until memWidth) {
when (io.lsu.exe(i).req.bits.sfence.valid) {
io.ifu.sfence_take_pc := true.B
io.ifu.sfence_addr := io.lsu.exe(i).req.bits.sfence.bits.addr
}
}

// We must redirect the PC the cycle after playing the SFENCE game.
io.ifu.flush_take_pc := rob.io.flush.valid || RegNext(io.lsu.exe.req.bits.sfence.valid)
io.ifu.flush_take_pc := rob.io.flush.valid || RegNext(io.ifu.sfence_take_pc)

// TODO FIX THIS HACK
// The below code works because of two quirks with the flush mechanism
Expand Down Expand Up @@ -343,7 +356,14 @@ class BoomCore(implicit p: Parameters) extends BoomModule
(br_unit.brinfo.mispredict && br_unit.brinfo.is_jr && csr.io.status.debug)

// Delay sfence to match pushing the sfence.addr into the TLB's CAM port.
io.ifu.sfence := RegNext(io.lsu.exe.req.bits.sfence)
// Default: no delayed sfence is presented to the frontend.
io.ifu.sfence.valid := false.B
io.ifu.sfence.bits := DontCare
for (i <- 0 until memWidth) {
  // Register the whole request OUTSIDE the `when` scope. A RegNext created inside a
  // `when` body has its update gated by that condition, so the payload register would
  // only sample on the cycle the delayed valid is high and would forward stale bits
  // instead of the bits that accompanied the original sfence.valid.
  val sfence_d = RegNext(io.lsu.exe(i).req.bits.sfence)
  // If several ports fire together the highest-indexed one wins (last connect).
  when (sfence_d.valid) {
    io.ifu.sfence.valid := true.B
    io.ifu.sfence.bits := sfence_d.bits
  }
}

//-------------------------------------------------------------
//-------------------------------------------------------------
Expand Down Expand Up @@ -637,6 +657,16 @@ class BoomCore(implicit p: Parameters) extends BoomModule
int_ren_wakeups(0).valid := ll_wbarb.io.out.fire() && ll_wbarb.io.out.bits.uop.dst_rtype === RT_FIX
int_ren_wakeups(0).bits := ll_wbarb.io.out.bits

// Memory ports other than port 0 get dedicated wakeup ports driven directly from the
// unit's ll_iresp (int_ren_wakeups(0) is driven from ll_wbarb above). Only responses whose
// destination register type is RT_FIX wake up integer consumers.
// NOTE(review): the ports are indexed by i while iss_wu_idx / ren_wu_idx advance
// separately; this assumes both counters equal 1 on entry. Confirm against their
// initialisation.
for (i <- 1 until memWidth) {
int_iss_wakeups(i).valid := mem_resps(i).valid && mem_resps(i).bits.uop.dst_rtype === RT_FIX
int_iss_wakeups(i).bits := mem_resps(i).bits

int_ren_wakeups(i).valid := mem_resps(i).valid && mem_resps(i).bits.uop.dst_rtype === RT_FIX
int_ren_wakeups(i).bits := mem_resps(i).bits
// Account for the issue and rename wakeup port consumed by this memory port.
iss_wu_idx += 1
ren_wu_idx += 1
}

// loop through each issue-port (exe_units are statically connected to an issue-port)
for (i <- 0 until exe_units.length) {
if (exe_units(i).writesIrf) {
Expand Down Expand Up @@ -704,7 +734,8 @@ class BoomCore(implicit p: Parameters) extends BoomModule
}

var iss_idx = 0
var iss_cnt = 0
var int_iss_cnt = 0
var mem_iss_cnt = 0
for (w <- 0 until exe_units.length) {
var fu_types = exe_units(w).io.fu_types
val exe_unit = exe_units(w)
Expand All @@ -717,14 +748,15 @@ class BoomCore(implicit p: Parameters) extends BoomModule
}

if (exe_unit.hasMem) {
iss_valids(iss_idx) := issue_units.mem_iq.io.iss_valids(0)
iss_uops(iss_idx) := issue_units.mem_iq.io.iss_uops(0)
issue_units.mem_iq.io.fu_types(0) := fu_types
iss_valids(iss_idx) := issue_units.mem_iq.io.iss_valids(mem_iss_cnt)
iss_uops(iss_idx) := issue_units.mem_iq.io.iss_uops(mem_iss_cnt)
issue_units.mem_iq.io.fu_types(mem_iss_cnt) := fu_types
mem_iss_cnt += 1
} else {
iss_valids(iss_idx) := issue_units.int_iq.io.iss_valids(iss_cnt)
iss_uops(iss_idx) := issue_units.int_iq.io.iss_uops(iss_cnt)
issue_units.int_iq.io.fu_types(iss_cnt) := fu_types
iss_cnt += 1
iss_valids(iss_idx) := issue_units.int_iq.io.iss_valids(int_iss_cnt)
iss_uops(iss_idx) := issue_units.int_iq.io.iss_uops(int_iss_cnt)
issue_units.int_iq.io.fu_types(int_iss_cnt) := fu_types
int_iss_cnt += 1
}
iss_idx += 1
}
Expand All @@ -739,13 +771,10 @@ class BoomCore(implicit p: Parameters) extends BoomModule
require (issue_units.count(_.iqType == IQT_MEM.litValue) == 1 || usingUnifiedMemIntIQs)
val mem_iq = issue_units.mem_iq

require (mem_iq.issueWidth == 1)
require (mem_iq.issueWidth <= 2)
issue_units.map(_.io.ld_miss := io.lsu.ld_miss)

// Share the memory port with other long latency operations.
val mem_unit = exe_units.memory_unit
val mem_resp = mem_unit.io.ll_iresp
mem_unit.io.com_exception := rob.io.flush.valid
// Broadcast the ROB flush to every memory unit as its committed-exception signal.
mem_units.foreach { unit =>
  unit.io.com_exception := rob.io.flush.valid
}

// Wakeup (Issue & Writeback)
for {
Expand Down Expand Up @@ -920,8 +949,14 @@ class BoomCore(implicit p: Parameters) extends BoomModule
//-------------------------------------------------------------

// Integer register-file write-port allocation. Port 0 is taken by ll_wbarb, so w_cnt
// (the next free write port) starts at 1.
var w_cnt = 1
// 0th goes to ll_wbarb
iregfile.io.write_ports(0) := WritePort(ll_wbarb.io.out, ipregSz, xLen, RT_FIX)
// Memory unit 0 feeds input 0 of the arbiter.
ll_wbarb.io.in(0) <> mem_resps(0)
assert (ll_wbarb.io.in(0).ready) // never backpressure the memory unit.
// The remaining memory units bypass the arbiter: each one gets its own write port.
for (i <- 1 until memWidth) {
iregfile.io.write_ports(w_cnt) := WritePort(mem_resps(i), ipregSz, xLen, RT_FIX)
w_cnt += 1
}

for (i <- 0 until exe_units.length) {
if (exe_units(i).writesIrf) {
val wbresp = exe_units(i).io.iresp
Expand Down Expand Up @@ -956,16 +991,15 @@ class BoomCore(implicit p: Parameters) extends BoomModule
}
}
require(w_cnt == iregfile.io.write_ports.length)
ll_wbarb.io.in(0) <> mem_resp
assert (ll_wbarb.io.in(0).ready) // never backpressure the memory unit.


if (usingFPU) {
// Connect IFPU
fp_pipeline.io.from_int <> exe_units.ifpu_unit.io.ll_fresp
fp_pipeline.io.from_int <> exe_units.ifpu_unit.io.ll_fresp
// Connect FPIU
ll_wbarb.io.in(1) <> fp_pipeline.io.to_int
ll_wbarb.io.in(1) <> fp_pipeline.io.to_int
// Connect FLDs
fp_pipeline.io.ll_wport <> exe_units.memory_unit.io.ll_fresp
fp_pipeline.io.ll_wports <> exe_units.memory_units.map(_.io.ll_fresp)
}
if (usingRoCC) {
require(usingFPU)
Expand All @@ -987,6 +1021,14 @@ class BoomCore(implicit p: Parameters) extends BoomModule
rob.io.debug_wb_valids(0) := ll_wbarb.io.out.valid && ll_uop.dst_rtype =/= RT_X
rob.io.debug_wb_wdata(0) := ll_wbarb.io.out.bits.data
var cnt = 1
// Report responses from memory units 1 until memWidth to the ROB, one ROB writeback port
// each (port 0 is handled above via ll_wbarb; cnt starts at 1).
for (i <- 1 until memWidth) {
val mem_uop = mem_resps(i).bits.uop
// A response is not reported as a writeback when its uop uses the store queue, unless
// that uop is an AMO.
rob.io.wb_resps(cnt).valid := mem_resps(i).valid && !(mem_uop.uses_stq && !mem_uop.is_amo)
rob.io.wb_resps(cnt).bits := mem_resps(i).bits
// Debug writeback view: flagged for any response whose destination type is not RT_X.
rob.io.debug_wb_valids(cnt) := mem_resps(i).valid && mem_uop.dst_rtype =/= RT_X
rob.io.debug_wb_wdata(cnt) := mem_resps(i).bits.data
cnt += 1
}
var f_cnt = 0 // rob fflags port index
for (eu <- exe_units) {
if (eu.writesIrf)
Expand All @@ -1013,7 +1055,7 @@ class BoomCore(implicit p: Parameters) extends BoomModule
}
}

require(cnt == numIrfWritePorts + 1)
require(cnt == numIrfWritePorts + memWidth)
if (usingFPU) {
for ((wdata, wakeup) <- fp_pipeline.io.debug_wb_wdata zip fp_pipeline.io.wakeups) {
rob.io.wb_resps(cnt) <> wakeup
Expand All @@ -1040,8 +1082,10 @@ class BoomCore(implicit p: Parameters) extends BoomModule
exe_units(brunit_idx).io.status := csr.io.status

// Connect CSR status and breakpoint info to the memory address-calculation unit(s)
exe_units.memory_unit.io.status := csr.io.status
exe_units.memory_unit.io.bp := csr.io.bp
// Every memory unit receives the same CSR status and breakpoint configuration.
(0 until memWidth).foreach { port =>
  val mu = mem_units(port)
  mu.io.status := csr.io.status
  mu.io.bp := csr.io.bp
}

// LSU <> ROB
rob.io.lsu_clr_bsy := io.lsu.clr_bsy
Expand Down
17 changes: 9 additions & 8 deletions src/main/scala/exu/execution-units/execution-units.scala
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,8 @@ class ExecutionUnits(val fpu: Boolean)(implicit val p: Parameters) extends HasBo
exe_units.count(f)
}

lazy val memory_unit = {
require (exe_units.count(_.hasMem) == 1) // only one mem_unit supported
exe_units.find(_.hasMem).get
// All execution units that have a memory port, in the order they appear in exe_units.
lazy val memory_units = {
  exe_units.filter(unit => unit.hasMem)
}

lazy val br_unit = {
Expand Down Expand Up @@ -111,15 +110,17 @@ class ExecutionUnits(val fpu: Boolean)(implicit val p: Parameters) extends HasBo
val int_width = issueParams.find(_.iqType == IQT_INT.litValue).get.issueWidth

if (!usingUnifiedMemIntIQs) {
val memExeUnit = Module(new ALUExeUnit(
hasAlu = false,
hasMem = true))
// Instantiate one memory-only execution unit (no ALU) per memory issue port and append
// each one to exe_units.
for (w <- 0 until memWidth) {
val memExeUnit = Module(new ALUExeUnit(
hasAlu = false,
hasMem = true))

// ready on ll_iresp is left as DontCare here.
// NOTE(review): presumably it is driven where the response is consumed; confirm.
memExeUnit.io.ll_iresp.ready := DontCare

exe_units += memExeUnit
}
}

// A unified mem/int issue queue supports exactly one memory port.
require(!usingUnifiedMemIntIQs || memWidth == 1)

for (w <- 0 until int_width) {
def is_nth(n: Int): Boolean = w == ((n) % int_width)
Expand All @@ -130,7 +131,7 @@ class ExecutionUnits(val fpu: Boolean)(implicit val p: Parameters) extends HasBo
hasMul = is_nth(2),
hasDiv = is_nth(3),
hasIfpu = is_nth(4) && usingFPU,
hasMem = usingUnifiedMemIntIQs))
hasMem = is_nth(0) && usingUnifiedMemIntIQs))
exe_units += alu_exe_unit
}
} else {
Expand Down

0 comments on commit f6ffb9c

Please sign in to comment.