Skip to content

Commit

Permalink
Merge pull request #171 from riscv-boom/no_slow_mul
Browse files Browse the repository at this point in the history
[exu] Deprecate unpipelined integer multiplier
  • Loading branch information
abejgonzalez committed Jan 28, 2019
2 parents e53d931 + cf75da9 commit 1223663
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 56 deletions.
8 changes: 4 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ jobs:
boomconfig-run-csmith-tests:
docker:
- image: riscvboom/riscvboom-images:0.0.5
no_output_timeout: 30m
environment:
JVM_OPTS: -Xmx3200m # Customize the JVM maximum heap limit
TERM: dumb
Expand All @@ -294,6 +293,7 @@ jobs:
- run:
name: Run BoomConfig csmith tests
command: .circleci/build-run-csmith-tests.sh BoomConfig 50
no_output_timeout: 30m

boomconfig-run-benchmark-tests:
docker:
Expand Down Expand Up @@ -355,7 +355,6 @@ jobs:
smallboomconfig-run-csmith-tests:
docker:
- image: riscvboom/riscvboom-images:0.0.5
no_output_timeout: 30m
environment:
JVM_OPTS: -Xmx3200m # Customize the JVM maximum heap limit
TERM: dumb
Expand All @@ -379,6 +378,7 @@ jobs:
- run:
name: Run SmallBoomConfig csmith tests
command: .circleci/build-run-csmith-tests.sh SmallBoomConfig 50
no_output_timeout: 30m

# No benchmark tests because those are rv64gc
smallrv32boomconfig-run-assembly-tests:
Expand Down Expand Up @@ -465,7 +465,6 @@ jobs:
mediumboomconfig-run-csmith-tests:
docker:
- image: riscvboom/riscvboom-images:0.0.5
no_output_timeout: 30m
environment:
JVM_OPTS: -Xmx3200m # Customize the JVM maximum heap limit
TERM: dumb
Expand All @@ -490,6 +489,7 @@ jobs:
- run:
name: Run MediumBoomConfig csmith tests
command: .circleci/build-run-csmith-tests.sh MediumBoomConfig 50
no_output_timeout: 30m

mediumboomconfig-run-benchmark-tests:
docker:
Expand Down Expand Up @@ -551,7 +551,6 @@ jobs:
megaboomconfig-run-csmith-tests:
docker:
- image: riscvboom/riscvboom-images:0.0.5
no_output_timeout: 30m
environment:
JVM_OPTS: -Xmx3200m # Customize the JVM maximum heap limit
TERM: dumb
Expand All @@ -576,6 +575,7 @@ jobs:
- run:
name: Run MegaBoomConfig csmith tests
command: .circleci/build-run-csmith-tests.sh MegaBoomConfig 50
no_output_timeout: 30m

megaboomconfig-run-benchmark-tests:
docker:
Expand Down
69 changes: 31 additions & 38 deletions src/main/scala/exu/execute.scala
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,14 @@ class ALUExeUnit(
has_mul : Boolean = false,
has_div : Boolean = false,
has_fdiv : Boolean = false,
has_ifpu : Boolean = false,
use_slow_mul : Boolean = false)
has_ifpu : Boolean = false)
(implicit p: Parameters)
extends ExecutionUnit(
num_rf_read_ports = if (has_fpu) 3 else 2,
num_rf_write_ports = 1,
num_bypass_stages =
(if (has_fpu && has_alu) p(tile.TileKey).core.fpu.get.dfmaLatency
else if (has_alu && has_mul && !use_slow_mul) 3 //TODO XXX p(tile.TileKey).core.imulLatency
else if (has_alu && has_mul) 3 //TODO XXX p(tile.TileKey).core.imulLatency
else if (has_alu) 1 else 0),
data_width = if (has_fpu || has_fdiv) 65 else 64,
bypassable = has_alu,
Expand All @@ -152,34 +151,30 @@ class ALUExeUnit(
has_fdiv = has_fdiv,
has_ifpu = has_ifpu)(p)
{
val has_muldiv = has_div || (has_mul && use_slow_mul)

val out_str = new StringBuilder
out_str.append("\n ExeUnit--")
if (has_alu) out_str.append("\n - ALU")
if (has_fpu) out_str.append("\n - FPU (Latency: " + dfmaLatency + ")")
if (has_mul && !use_slow_mul) out_str.append("\n - Mul (pipelined)")
if (has_div && has_mul && use_slow_mul) out_str.append("\n - Mul/Div (unpipelined)")
else if (has_mul && use_slow_mul) out_str.append("\n - Mul (unpipelined)")
else if (has_div) out_str.append("\n - Div")
if (has_mul) out_str.append("\n - Mul")
if (has_div) out_str.append("\n - Div")
if (has_fdiv) out_str.append("\n - FDiv/FSqrt")
if (has_ifpu) out_str.append("\n - IFPU (for read port access)")

override def toString: String = out_str.toString

val muldiv_busy = WireInit(false.B)
val div_busy = WireInit(false.B)
val fdiv_busy = WireInit(false.B)

// The Functional Units --------------------
val fu_units = ArrayBuffer[FunctionalUnit]()

io.fu_types := FU_ALU |
Mux(has_fpu.B, FU_FPU, 0.U) |
Mux((has_mul && !use_slow_mul).B, FU_MUL, 0.U) |
(Mux(!muldiv_busy && (has_mul && use_slow_mul).B, FU_MUL, 0.U)) |
(Mux(!muldiv_busy && has_div.B, FU_DIV, 0.U)) |
(Mux(shares_csr_wport.B, FU_CSR, 0.U)) |
(Mux(is_branch_unit.B, FU_BRU, 0.U)) |
Mux(has_mul.B, FU_MUL, 0.U) |
Mux(!div_busy && has_div.B, FU_DIV, 0.U) |
Mux(shares_csr_wport.B, FU_CSR, 0.U) |
Mux(is_branch_unit.B, FU_BRU, 0.U) |
Mux(!fdiv_busy && has_fdiv.B, FU_FDV, 0.U)


Expand Down Expand Up @@ -222,7 +217,7 @@ class ALUExeUnit(

// Pipelined, IMul Unit ------------------
var imul: PipelinedMulUnit = null
if (has_mul && !use_slow_mul)
if (has_mul)
{
imul = Module(new PipelinedMulUnit(imulLatency, xLen))
imul.io <> DontCare
Expand Down Expand Up @@ -297,30 +292,28 @@ class ALUExeUnit(
fu_units += fdivsqrt
}

// Mul/Div/Rem Unit -----------------------
var muldiv: MulDivUnit = null
val muldiv_resp_val = WireInit(false.B)
if (has_muldiv)
// Div/Rem Unit -----------------------
var div: DivUnit = null
val div_resp_val = WireInit(false.B)
if (has_div)
{
muldiv = Module(new MulDivUnit(xLen))
muldiv.io <> DontCare
muldiv.io.req.valid := io.req.valid &&
((io.req.bits.uop.fu_code_is(FU_DIV) && has_div.B) ||
(io.req.bits.uop.fu_code_is(FU_MUL) && (has_mul && use_slow_mul).B))
muldiv.io.req.bits.uop := io.req.bits.uop
muldiv.io.req.bits.rs1_data := io.req.bits.rs1_data
muldiv.io.req.bits.rs2_data := io.req.bits.rs2_data
muldiv.io.brinfo := io.brinfo
muldiv.io.req.bits.kill := io.req.bits.kill
div = Module(new DivUnit(xLen))
div.io <> DontCare
div.io.req.valid := io.req.valid && io.req.bits.uop.fu_code_is(FU_DIV) && has_div.B
div.io.req.bits.uop := io.req.bits.uop
div.io.req.bits.rs1_data := io.req.bits.rs1_data
div.io.req.bits.rs2_data := io.req.bits.rs2_data
div.io.brinfo := io.brinfo
div.io.req.bits.kill := io.req.bits.kill

// share write port with the pipelined units
muldiv.io.resp.ready := !(fu_units.map(_.io.resp.valid).reduce(_|_))
div.io.resp.ready := !(fu_units.map(_.io.resp.valid).reduce(_|_))

muldiv_resp_val := muldiv.io.resp.valid
muldiv_busy := !muldiv.io.req.ready ||
(io.req.valid && (io.req.bits.uop.fu_code_is(FU_DIV) ||
(io.req.bits.uop.fu_code_is(FU_MUL) && (has_mul && use_slow_mul).B)))
fu_units += muldiv
div_resp_val := div.io.resp.valid
div_busy := !div.io.req.ready ||
(io.req.valid && io.req.bits.uop.fu_code_is(FU_DIV))

fu_units += div
}

// Outputs (Write Port #0) ---------------
Expand All @@ -339,9 +332,9 @@ class ALUExeUnit(

io.resp(0).bits.fflags := Mux(fpu_resp_val, fpu_resp_fflags, fdiv_resp_fflags)

assert ((PopCount(fu_units.map(_.io.resp.valid)) <= 1.U && !muldiv_resp_val && !fdiv_resp_val) ||
(PopCount(fu_units.map(_.io.resp.valid)) <= 2.U && (muldiv_resp_val || fdiv_resp_val)) ||
(PopCount(fu_units.map(_.io.resp.valid)) <= 3.U && muldiv_resp_val && fdiv_resp_val)
assert ((PopCount(fu_units.map(_.io.resp.valid)) <= 1.U && !div_resp_val && !fdiv_resp_val) ||
(PopCount(fu_units.map(_.io.resp.valid)) <= 2.U && (div_resp_val || fdiv_resp_val)) ||
(PopCount(fu_units.map(_.io.resp.valid)) <= 3.U && div_resp_val && fdiv_resp_val)
, "Multiple functional units are fighting over the write port.")
}

Expand Down
1 change: 0 additions & 1 deletion src/main/scala/exu/execution_units.scala
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ class ExecutionUnits(fpu: Boolean = false)(implicit val p: Parameters) extends H
val aluExeUnit = Module(new ALUExeUnit(is_branch_unit = true
, shares_csr_wport = true
, has_mul = true
, use_slow_mul = false
, has_div = true
, has_ifpu = int_width==1
))
Expand Down
29 changes: 16 additions & 13 deletions src/main/scala/exu/functional_unit.scala
Original file line number Diff line number Diff line change
Expand Up @@ -739,26 +739,29 @@ abstract class IterativeFunctionalUnit(data_width: Int)(implicit p: Parameters)
}


class MulDivUnit(data_width: Int)(implicit p: Parameters) extends IterativeFunctionalUnit(data_width)(p)
class DivUnit(data_width: Int)(implicit p: Parameters) extends IterativeFunctionalUnit(data_width)(p)
{
val muldiv = Module(new freechips.rocketchip.rocket.MulDiv(mulDivParams, width = data_width))

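// Rocket's MulDiv also implements an iterative multiplier, but we don't use it here:
// this unit only services divide/remainder requests. Multiplies are handled by the
// pipelined multiplier instead (see PipelinedMulUnit).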
val div = Module(new freechips.rocketchip.rocket.MulDiv(mulDivParams, width = data_width))

// request
muldiv.io.req.valid := io.req.valid && !this.do_kill
muldiv.io.req.bits.dw := io.req.bits.uop.ctrl.fcn_dw
muldiv.io.req.bits.fn := io.req.bits.uop.ctrl.op_fcn
muldiv.io.req.bits.in1 := io.req.bits.rs1_data
muldiv.io.req.bits.in2 := io.req.bits.rs2_data
muldiv.io.req.bits.tag := DontCare
io.req.ready := muldiv.io.req.ready
div.io.req.valid := io.req.valid && !this.do_kill
div.io.req.bits.dw := io.req.bits.uop.ctrl.fcn_dw
div.io.req.bits.fn := io.req.bits.uop.ctrl.op_fcn
div.io.req.bits.in1 := io.req.bits.rs1_data
div.io.req.bits.in2 := io.req.bits.rs2_data
div.io.req.bits.tag := DontCare
io.req.ready := div.io.req.ready

// handle pipeline kills and branch misspeculations
muldiv.io.kill := this.do_kill
div.io.kill := this.do_kill

// response
io.resp.valid := muldiv.io.resp.valid && !this.do_kill
muldiv.io.resp.ready := io.resp.ready
io.resp.bits.data := muldiv.io.resp.bits.data
io.resp.valid := div.io.resp.valid && !this.do_kill
div.io.resp.ready := io.resp.ready
io.resp.bits.data := div.io.resp.bits.data
}

class PipelinedMulUnit(num_stages: Int, data_width: Int)(implicit p: Parameters)
Expand Down

0 comments on commit 1223663

Please sign in to comment.