From de673f6d226dc3184bedeabd8460c1a8b885cec7 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sat, 15 Jun 2024 18:04:01 +0200 Subject: [PATCH] [hardware] Add support for 16alt, 8, 8alt --- hardware/include/ara_pkg.sv | 79 +++++++++++++--- hardware/src/ara.sv | 2 +- hardware/src/ara_dispatcher.sv | 7 +- hardware/src/ara_soc.sv | 26 ++++-- hardware/src/ara_system.sv | 2 +- hardware/src/lane/lane.sv | 2 +- hardware/src/lane/operand_queue.sv | 49 +++++++++- hardware/src/lane/operand_queues_stage.sv | 2 +- hardware/src/lane/vector_fus_stage.sv | 2 +- hardware/src/lane/vmfpu.sv | 106 ++++++++++++++++++---- 10 files changed, 229 insertions(+), 48 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 6e6fc084a..b0845c9e6 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -45,29 +45,42 @@ package ara_pkg; FPExtSupportEnable = 1'b1 } fpext_support_e; - // The three bits correspond to {RVVD, RVVF, RVVH} - typedef enum logic [2:0] { - FPUSupportNone = 3'b000, - FPUSupportHalf = 3'b001, - FPUSupportSingle = 3'b010, - FPUSupportHalfSingle = 3'b011, - FPUSupportDouble = 3'b100, - FPUSupportSingleDouble = 3'b110, - FPUSupportHalfSingleDouble = 3'b111 + // The six bits correspond to {RVVD, RVVF, RVVH, RVVHA, RVVB, RVVBA} + typedef enum logic [5:0] { + FPUSupportNone = 6'b000000, + FPUSupportHalf = 6'b001000, + FPUSupportSingle = 6'b010000, + FPUSupportHalfSingle = 6'b011000, + FPUSupportDouble = 6'b100000, + FPUSupportSingleDouble = 6'b110000, + FPUSupportHalfSingleDouble = 6'b111000, + FPUSupportAll = 6'b111111 } fpu_support_e; function automatic logic RVVD(fpu_support_e e); - return e[2]; + return e[5]; endfunction : RVVD function automatic logic RVVF(fpu_support_e e); - return e[1]; + return e[4]; endfunction : RVVF function automatic logic RVVH(fpu_support_e e); - return e[0]; + return e[3]; endfunction : RVVH + function automatic logic RVVHA(fpu_support_e e); + return e[2]; + endfunction : RVVHA + + function 
automatic logic RVVB(fpu_support_e e); + return e[1]; + endfunction : RVVB + + function automatic logic RVVBA(fpu_support_e e); + return e[0]; + endfunction : RVVBA + // Multiplier latencies. localparam int unsigned LatMultiplierEW64 = 1; localparam int unsigned LatMultiplierEW32 = 1; @@ -218,6 +231,24 @@ package ara_pkg; } resize_e; // Floating-Point structs for re-encoding during widening FP operations + typedef struct packed { + logic s; + logic [3:0] e; + logic [2:0] m; + } fp8alt_t; + + typedef struct packed { + logic s; + logic [4:0] e; + logic [1:0] m; + } fp8_t; + + typedef struct packed { + logic s; + logic [7:0] e; + logic [6:0] m; + } fp16alt_t; + typedef struct packed { logic s; logic [4:0] e; @@ -248,6 +279,30 @@ package ara_pkg; endcase endfunction + function fp16_t fp16_from_fp8(fp8_t fp8, logic [$clog2(fp_mantissa_bits(rvv_pkg::EW8, 0)):0] fp8_m_lzc); + automatic fp8_t fp8_temp; + automatic fp16_t fp16; + + // Wide sign + fp16.s = fp8.s; + + // Wide exponent + // fp8 (E5M2) and fp16 (E5M10) use the same 5-bit exponent and bias (15): no re-biasing + unique case(fp8.e) + '0: fp16.e = '0; // Zero or Subnormal (an fp8 subnormal is exactly an fp16 subnormal) + '1: fp16.e = '1; // Inf or NaN + default: fp16.e = fp8.e; // Normal + endcase + + // Wide mantissa + // If the input is NaN, output a quiet NaN mantissa. + // Otherwise, append trailing zeros to the mantissa. + fp8_temp.m = fp8.m; // No normalization shift needed, fp8_m_lzc is intentionally unused + fp16.m = ((fp8.e == '1) && (fp8.m != '0) ) ? {1'b1, 9'b0} : {fp8_temp.m, 8'b0}; + + fp16_from_fp8 = fp16; + endfunction + function fp32_t fp32_from_fp16(fp16_t fp16, logic [$clog2(fp_mantissa_bits(rvv_pkg::EW16, 0)):0] fp16_m_lzc); automatic fp16_t fp16_temp; automatic fp32_t fp32; diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 9b1b74463..43fe82c5b 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -10,7 +10,7 @@ module ara import ara_pkg::*; #( // RVV Parameters parameter int unsigned NrLanes = 0, // Number of parallel vector lanes. 
// Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7 parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 998e84230..80c254d2a 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -11,7 +11,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7 parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types @@ -2230,9 +2230,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end end - // Ara can support 16-bit float, 32-bit float, 64-bit float. + // Ara can support 8-bit float, 16-bit float, 32-bit float, 64-bit float. // Ara cannot support instructions who operates on more than 64 bits. unique case (FPUSupport) + FPUSupportAll: if (int'(ara_req_d.vtype.vsew) > int'(EW64) || int'(ara_req_d.eew_vs2) > int'(EW64)) + illegal_insn = 1'b1; FPUSupportHalfSingleDouble: if (int'(ara_req_d.vtype.vsew) < int'(EW16) || int'(ara_req_d.vtype.vsew) > int'(EW64) || int'(ara_req_d.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; @@ -2476,6 +2478,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Ara can support 16-bit float, 32-bit float, 64-bit float. // Ara cannot support instructions who operates on more than 64 bits. 
unique case (FPUSupport) + FPUSupportAll: if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; FPUSupportHalfSingleDouble: if (int'(ara_req_d.vtype.vsew) < int'(EW16) || int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; FPUSupportHalfSingle: if (int'(ara_req_d.vtype.vsew) < int'(EW16) || diff --git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index ca4a529cc..e7152b4b2 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -10,7 +10,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( // RVV Parameters parameter int unsigned NrLanes = 0, // Number of parallel vector lanes. // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7 parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types @@ -449,10 +449,10 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( AxiUserWidth : 1, NrLoadBufEntries : 2, FpuEn : 1, - XF16 : FPUSupport[0], - XF16ALT : 0, - XF8 : 0, - XF8ALT : 1, + XF16 : FPUSupport[3], + XF16ALT : FPUSupport[2], + XF8 : FPUSupport[1], + XF8ALT : FPUSupport[0], RVA : 1, RVB : 0, RVV : 1, @@ -463,8 +463,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( CvxifEn : 0, ZiCondExtEn : 0, RVSCLIC : 0, - RVF : FPUSupport[1], - RVD : FPUSupport[2], + RVF : FPUSupport[4], + RVD : FPUSupport[5], FpPresent : 1, NSX : 0, FLen : 64, @@ -614,4 +614,16 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( $error( "[ara] Cannot support half-precision floating-point on Ara if CVA6 does not support it."); + if (RVVHA(FPUSupport) && !CVA6AraConfig.XF16ALT) + $error( + "[ara] Cannot support alt-half-precision floating-point on Ara if CVA6 does not support it."); + + if (RVVB(FPUSupport) && !CVA6AraConfig.XF8) + $error( + "[ara] Cannot support byte-precision floating-point on Ara if CVA6 does not support it."); + + if 
(RVVBA(FPUSupport) && !CVA6AraConfig.XF8ALT) + $error( + "[ara] Cannot support alt-byte-precision floating-point on Ara if CVA6 does not support it."); + endmodule : ara_soc diff --git a/hardware/src/ara_system.sv b/hardware/src/ara_system.sv index 5ce66f592..9225de7f5 100644 --- a/hardware/src/ara_system.sv +++ b/hardware/src/ara_system.sv @@ -10,7 +10,7 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( // RVV Parameters parameter int unsigned NrLanes = 0, // Number of parallel vector lanes. // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7 parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index b4e006aaf..995c8a49c 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -12,7 +12,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( parameter int unsigned NrLanes = 1, // Number of lanes // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7 parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index a4e0c22c3..fee0624a5 100644 --- a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -14,7 +14,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i parameter int unsigned NrSlaves = 1, parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // Supported conversions parameter logic SupportIntExt2 = 
1'b0, parameter logic SupportIntExt4 = 1'b0, @@ -153,9 +153,11 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Floating-point conversion // //////////////////////////////// + logic [$clog2(fp_mantissa_bits(EW8, 0))-1:0] fp8_m_lzc[4]; // 2 bits each logic [$clog2(fp_mantissa_bits(EW16, 0))-1:0] fp16_m_lzc[2]; // 4 bits each logic [$clog2(fp_mantissa_bits(EW32, 0))-1:0] fp32_m_lzc; // 5 bits each + fp8_t fp8[4]; fp16_t fp16[2]; fp32_t fp32; @@ -166,6 +168,20 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // By knowing the number of leading zeros in the mantissa, we can properly // adjust the exponent and shift the binary point to achieve a normalized // representation of the number. + if ({RVVB(FPUSupport), RVVH(FPUSupport)} == 2'b11) begin + // sew: 8-bit + for (genvar i = 0; i < 4; i++) begin + lzc #( + .WIDTH(fp_mantissa_bits(EW8, 0)), + .MODE (1) + ) leading_zero_e8_i ( + .in_i (fp8[i].m ), + .cnt_o (fp8_m_lzc[i]), + .empty_o(/*Unused*/ ) + ); + end + end + if ({RVVH(FPUSupport), RVVF(FPUSupport)} == 2'b11) begin // sew: 16-bit for (genvar i = 0; i < 2; i++) begin @@ -202,6 +218,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i incomplete_packet = 1'b0; last_packet = 1'b0; + for (int i = 0; i < 4; i++) fp8[i] = '0; for (int i = 0; i < 2; i++) fp16[i] = '0; for (int i = 0; i < 1; i++) fp32[i] = '0; @@ -243,6 +260,13 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end MFPU_ADDRGEN: begin unique case (cmd.eew) + EW8: begin + unique case (cmd.ntr_red) + 2'b01: ntr.w64 = {8{8'h7c}}; // +inf in fp8 (E5M2), one copy per byte lane + 2'b10: ntr.w64 = {8{8'hfc}}; // -inf in fp8 (E5M2), one copy per byte lane + default:; + endcase + end EW16: begin unique case (cmd.ntr_red) 2'b01: ntr.w64 = {4{16'h7c00}}; @@ -371,17 +395,31 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end end - // Floating-Point re-encoding + // Floating-Point re-encoding (not supported for alt-16 and alt-8) 
OpQueueConversionWideFP2: begin if (FPUSupport != FPUSupportNone) begin - unique casez ({cmd.eew, RVVH(FPUSupport), RVVF(FPUSupport), RVVD(FPUSupport)}) - {EW16, 1'b1, 1'b1, 1'b?}: begin + unique casez ({cmd.eew, RVVBA(FPUSupport), RVVB(FPUSupport), + RVVHA(FPUSupport), RVVH(FPUSupport), + RVVF(FPUSupport), RVVD(FPUSupport)}) + {EW8, 1'b?, 1'b1, 1'b?, 1'b1, 1'b?, 1'b?}: begin + for (int e = 0; e < 4; e++) begin + automatic fp8_t fp8_in = ibuf_operand[8*select + 16*e +: 8]; + automatic fp16_t fp16_out; + + fp16_out.s = fp8_in.s; + fp16_out.e = fp8_in.e; // fp8 (E5M2) and fp16 share the 5-bit exponent and bias (15) + fp16_out.m = ((fp8_in.e == '1) && (fp8_in.m != '0)) ? {1'b1, 9'b0} : {fp8_in.m, 8'b0}; // quiet NaN, else zero-pad to 10 bits + + conv_operand[16*e +: 16] = fp16_out; + end + end + {EW16, 1'b?, 1'b?, 1'b?, 1'b1, 1'b1, 1'b?}: begin for (int e = 0; e < 2; e++) begin fp16[e] = ibuf_operand[8*select + 32*e +: 16]; conv_operand[32*e +: 32] = fp32_from_fp16(fp16[e], fp16_m_lzc[e]); end end - {EW32, 1'b?, 1'b1, 1'b1}: begin + {EW32, 1'b?, 1'b?, 1'b?, 1'b?, 1'b1, 1'b1}: begin fp32 = ibuf_operand[8*select +: 32]; conv_operand = fp64_from_fp32(fp32, fp32_m_lzc); end @@ -393,6 +431,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Zero extension + Reordering for FP conversions OpQueueAdjustFPCvt: begin unique case (cmd.eew) + EW8: conv_operand = {32'b0, ibuf_operand[48 + 8*select +: 8], ibuf_operand[16 + 8*select +: 8], ibuf_operand[32 + 8*select +: 8], ibuf_operand[8*select +: 8]}; EW16: conv_operand = {32'b0, ibuf_operand[32 + 8*select +: 16], ibuf_operand[8*select +: 16]}; EW32: conv_operand = {32'b0, ibuf_operand[8*select +: 32]}; default:; diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index 467584510..356661d70 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -9,7 +9,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #( parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = 
FPUSupportHalfSingleDouble + parameter fpu_support_e FPUSupport = FPUSupportAll ) ( input logic clk_i, input logic rst_ni, diff --git a/hardware/src/lane/vector_fus_stage.sv b/hardware/src/lane/vector_fus_stage.sv index e76d497d4..59152326f 100644 --- a/hardware/src/lane/vector_fus_stage.sv +++ b/hardware/src/lane/vector_fus_stage.sv @@ -10,7 +10,7 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #( parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7 parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index d61aa61fc..ca5781f0c 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -11,7 +11,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; import cf_math_pkg::idx_width; #( parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportAll, // External support for vfrec7, vfrsqrt7, rounding-toward-odd parameter fpext_support_e FPExtSupport = FPExtSupportEnable, // Support for fixed-point data types @@ -206,7 +206,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; case (sew) EW64: fpu_latency = LatFCompEW64; EW32: fpu_latency = LatFCompEW32; - default: fpu_latency = LatFCompEW16; + EW16: fpu_latency = LatFCompEW16; + default: fpu_latency = LatFCompEW8; endcase end endcase @@ -278,7 +279,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // FPU SIMD result needs to be shuffled for narrowing instructions before commit elen_t narrowing_shuffled_result; // Helper signal to shuffle the narrowed result 
- logic [3:0] narrowing_shuffle_be; + logic [7:0] narrowing_shuffle_be; ////////////////// // Multiplier // @@ -673,6 +674,20 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // [63:48] | [47:32] | [31:16] | [15:0] function automatic elen_t processed_osum_operand(elen_t mfpu_operand, logic [2:0] osum_issue_cnt, vew_e ew, logic is_masked, strb_t mask, elen_t ntr_val); case (ew) + EW8: begin + case (osum_issue_cnt) + 4'd0: processed_osum_operand = (is_masked & ~mask[0]) ? {56'd0, ntr_val[7:0] } : {56'd0, mfpu_operand[7:0] }; + 4'd1: processed_osum_operand = (is_masked & ~mask[4]) ? {56'd0, ntr_val[39:32]} : {56'd0, mfpu_operand[39:32]}; + 4'd2: processed_osum_operand = (is_masked & ~mask[2]) ? {56'd0, ntr_val[23:16]} : {56'd0, mfpu_operand[23:16]}; + 4'd3: processed_osum_operand = (is_masked & ~mask[6]) ? {56'd0, ntr_val[55:48]} : {56'd0, mfpu_operand[55:48]}; + 4'd4: processed_osum_operand = (is_masked & ~mask[1]) ? {56'd0, ntr_val[15:8] } : {56'd0, mfpu_operand[15:8] }; + 4'd5: processed_osum_operand = (is_masked & ~mask[5]) ? {56'd0, ntr_val[47:40]} : {56'd0, mfpu_operand[47:40]}; + 4'd6: processed_osum_operand = (is_masked & ~mask[3]) ? {56'd0, ntr_val[31:24]} : {56'd0, mfpu_operand[31:24]}; + 4'd7: processed_osum_operand = (is_masked & ~mask[7]) ? {56'd0, ntr_val[63:56]} : {56'd0, mfpu_operand[63:56]}; + // Default case, no meaning + default: processed_osum_operand = (is_masked & ~mask[7]) ? {56'd0, ntr_val[63:56]} : {56'd0, mfpu_operand[63:56]}; + endcase + end EW16: begin case (osum_issue_cnt) 4'd0: processed_osum_operand = (is_masked & ~mask[0]) ? 
{48'd0, ntr_val[15:0] } : {48'd0, mfpu_operand[15:0] }; @@ -726,9 +741,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Floating-point conversion // //////////////////////////////// + logic [$clog2(fp_mantissa_bits(EW8, 0))-1:0] fp8_m_lzc[4]; // 2 bits each logic [$clog2(fp_mantissa_bits(EW16, 0))-1:0] fp16_m_lzc[2]; // 4 bits each logic [$clog2(fp_mantissa_bits(EW32, 0))-1:0] fp32_m_lzc; // 5 bits each + fp8_t fp8[4]; fp16_t fp16[2]; fp32_t fp32; @@ -739,6 +756,20 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // By knowing the number of leading zeros in the mantissa, we can properly // adjust the exponent and shift the binary point to achieve a normalized // representation of the number. + if ({RVVB(FPUSupport), RVVH(FPUSupport)} == 2'b11) begin + // sew: 8-bit + for (genvar i = 0; i < 4; i++) begin + lzc #( + .WIDTH(fp_mantissa_bits(EW8, 0)), + .MODE (1) + ) leading_zero_e8_i ( + .in_i (fp8[i].m ), + .cnt_o (fp8_m_lzc[i]), + .empty_o(/*Unused*/ ) + ); + end + end + if ({RVVH(FPUSupport), RVVF(FPUSupport)} == 2'b11) begin // sew: 16-bit for (genvar i = 0; i < 2; i++) begin @@ -806,8 +837,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; Width : 64, EnableVectors: 1'b1, EnableNanBox : 1'b1, - FpFmtMask : {RVVF(FPUSupport), RVVD(FPUSupport), RVVH(FPUSupport), 1'b0, 1'b0, 1'b0}, - IntFmtMask : {1'b0, 1'b1, 1'b1, 1'b1} + FpFmtMask : {RVVF(FPUSupport), RVVD(FPUSupport), RVVH(FPUSupport), RVVB(FPUSupport), RVVHA(FPUSupport), RVVBA(FPUSupport)}, + IntFmtMask : {1'b1, 1'b1, 1'b1, 1'b1} }; // Implementation (number of registers etc) @@ -965,6 +996,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; fp_rm = RNE; // positive infinity case (vinsn_issue_q.vtype.vsew) + EW8: ntr_val = {8{8'h7c}}; // +inf in fp8 (E5M2): exponent all ones, mantissa zero EW16: ntr_val = {4{16'h7c00}}; EW32: ntr_val = {2{32'h7f800000}}; default: // EW64 @@ -976,6 +1008,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import 
fpnew_pkg::*; fp_rm = RTZ; // negative infinity case (vinsn_issue_q.vtype.vsew) + EW8: ntr_val = {8{8'hfc}}; // -inf in fp8 (E5M2): sign set, exponent all ones, mantissa zero EW16: ntr_val = {4{16'hfc00}}; EW32: ntr_val = {2{32'hff800000}}; default: // EW64 @@ -988,10 +1021,17 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // vtype.vsew encodes the destination format // cvt_resize is reused as neutral value for reductions unique case (vinsn_issue_q.vtype.vsew) + EW8: begin + fp_src_fmt = (vinsn_issue_q.cvt_resize == CVT_NARROW && !is_reduction(vinsn_issue_q.op)) ? FP16 : FP8; // same-width operations read fp8 sources + fp_dst_fmt = FP8; + fp_int_fmt = (vinsn_issue_q.cvt_resize == CVT_NARROW && !is_reduction(vinsn_issue_q.op) && fp_op == I2F) ? INT16 : INT8; + end EW16: begin - fp_src_fmt = (vinsn_issue_q.cvt_resize == CVT_NARROW && !is_reduction(vinsn_issue_q.op)) ? FP32 : FP16; + fp_src_fmt = (vinsn_issue_q.cvt_resize == CVT_WIDE && !is_reduction(vinsn_issue_q.op)) ? FP8 : + ((vinsn_issue_q.cvt_resize == CVT_NARROW && !is_reduction(vinsn_issue_q.op)) ? FP32 : FP16); fp_dst_fmt = FP16; - fp_int_fmt = (vinsn_issue_q.cvt_resize == CVT_NARROW && !is_reduction(vinsn_issue_q.op) && fp_op == I2F) ? INT32 : INT16; + fp_int_fmt = (vinsn_issue_q.cvt_resize == CVT_WIDE && !is_reduction(vinsn_issue_q.op) && fp_op == I2F) ? INT8 : + ((vinsn_issue_q.cvt_resize == CVT_NARROW && !is_reduction(vinsn_issue_q.op) && fp_op == I2F) ? INT32 : INT16); end EW32: begin fp_src_fmt = (vinsn_issue_q.cvt_resize == CVT_WIDE && !is_reduction(vinsn_issue_q.op)) ? 
FP16 : @@ -1143,7 +1183,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (FPExtSupport) begin - // vfrec7 + // vfrec7 (support only for 16,32,64-bit floating-point) unique case (vinsn_processing_q.vtype.vsew) EW16: begin for (int h = 0; h < 4; h++) vfrec7_out_e16[h] = @@ -1180,7 +1220,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; end endcase - //vfrsqrt7 + //vfrsqrt7 (support only for 16,32,64-bit floating-point) unique case (vinsn_processing_q.vtype.vsew) EW16: begin for (int h = 0; h < 4; h++) vfrsqrt7_out_e16[h] = @@ -1239,6 +1279,13 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // 2) Encode the mask in the bit after each comparison result if (vinsn_processing_q.op inside {[VMFEQ:VMFGE]}) begin unique case (vinsn_processing_q.vtype.vsew) + EW8: begin + for (int b = 0; b < 8; b++) vfpu_processed_result[8*b] = + (vinsn_processing_q.op == VMFNE) ? + ~vfpu_processed_result[8*b] : + vfpu_processed_result[8*b]; + for (int b = 0; b < 8; b++) vfpu_processed_result[8*b+1] = vfpu_mask[1*b]; + end EW16: begin for (int b = 0; b < 4; b++) vfpu_processed_result[16*b] = (vinsn_processing_q.op == VMFNE) ? @@ -1383,6 +1430,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; ? 
{vinsn_issue_q.use_vs2, vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs1} : {vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs2, vinsn_issue_q.use_vs1}; + for (int i = 0; i < 4; i++) fp8[i] = '0; for (int i = 0; i < 2; i++) fp16[i] = '0; for (int i = 0; i < 1; i++) fp32[i] = '0; @@ -1422,6 +1470,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Sign injection unique case (vinsn_issue_q.vtype.vsew) + EW8: for (int b = 0; b < 8; b++) begin + operand_a[8*b+7] = operand_a[8*b+7] ^ fp_sign[0]; + operand_b[8*b+7] = operand_b[8*b+7] ^ fp_sign[1]; + operand_c[8*b+7] = operand_c[8*b+7] ^ fp_sign[2]; + end EW16: for (int b = 0; b < 4; b++) begin operand_a[16*b+15] = operand_a[16*b+15] ^ fp_sign[0]; operand_b[16*b+15] = operand_b[16*b+15] ^ fp_sign[1]; @@ -1529,22 +1582,33 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Narrowing FPU results need to be shuffled before being saved for storing unique case (vinsn_processing_q.vtype.vsew) + EW8: begin + narrowing_shuffled_result[63:56] = unit_out_result[31:24]; + narrowing_shuffled_result[55:48] = unit_out_result[31:24]; + narrowing_shuffled_result[47:40] = unit_out_result[23:16]; + narrowing_shuffled_result[39:32] = unit_out_result[23:16]; + narrowing_shuffled_result[31:24] = unit_out_result[15:8]; + narrowing_shuffled_result[23:16] = unit_out_result[15:8]; + narrowing_shuffled_result[15:8] = unit_out_result[7:0]; + narrowing_shuffled_result[7:0] = unit_out_result[7:0]; + narrowing_shuffle_be = !narrowing_select_out_q ? 8'b01010101 : 8'b10101010; + end EW16: begin narrowing_shuffled_result[63:48] = unit_out_result[31:16]; narrowing_shuffled_result[47:32] = unit_out_result[31:16]; narrowing_shuffled_result[31:16] = unit_out_result[15:0]; narrowing_shuffled_result[15:0] = unit_out_result[15:0]; - narrowing_shuffle_be = !narrowing_select_out_q ? 4'b0101 : 4'b1010; + narrowing_shuffle_be = !narrowing_select_out_q ? 
8'b00110011 : 8'b11001100; end EW32: begin narrowing_shuffled_result[63:32] = unit_out_result[31:0]; narrowing_shuffled_result[31:0] = unit_out_result[31:0]; - narrowing_shuffle_be = !narrowing_select_out_q ? 4'b0011 : 4'b1100; + narrowing_shuffle_be = !narrowing_select_out_q ? 8'b00001111 : 8'b11110000; end default: begin narrowing_shuffled_result[63:32] = unit_out_result[31:0]; narrowing_shuffled_result[31:0] = unit_out_result[31:0]; - narrowing_shuffle_be = !narrowing_select_out_q ? 4'b0101 : 4'b1010; + narrowing_shuffle_be = !narrowing_select_out_q ? 8'b00110011 : 8'b11001100; end endcase @@ -1568,11 +1632,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; result_queue_d[result_queue_write_pnt_q].id = vinsn_processing_q.id; result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_processing_q.vd, NrLanes) + ((vinsn_processing_q.vl - to_process_cnt_q) >> (int'(EW64) - vinsn_processing_q.vtype.vsew)); - // FP narrowing instructions pack the result in two different cycles, and only some 16-bit slices are active + // FP narrowing instructions pack the result in two different cycles, and only some 8-bit slices are active if (narrowing(vinsn_processing_q.cvt_resize)) begin - for (int b = 0; b < 4; b++) begin + for (int b = 0; b < 8; b++) begin if (narrowing_shuffle_be[b]) - result_queue_d[result_queue_write_pnt_q].wdata[b*16 +: 16] = narrowing_shuffled_result[b*16 +: 16]; + result_queue_d[result_queue_write_pnt_q].wdata[b*8 +: 8] = narrowing_shuffled_result[b*8 +: 8]; end end else begin result_queue_d[result_queue_write_pnt_q].wdata = unit_out_result; @@ -2204,17 +2268,25 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (FPUSupport != FPUSupportNone) begin if (vfu_operation_i.wide_fp_imm) begin unique casez ({vfu_operation_i.vtype.vsew, + RVVB(FPUSupport), RVVH(FPUSupport), RVVF(FPUSupport), RVVD(FPUSupport)}) - {EW32, 1'b1, 1'b1, 1'b?}: begin + {EW16, 1'b1, 1'b1, 1'b?, 1'b?}: begin + for (int e = 0; e < 4; 
e++) begin + fp8[e] = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[7:0]; + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[16*e +: 16] = + fp16_from_fp8(fp8[e], fp8_m_lzc[e]); + end + end + {EW32, 1'b?, 1'b1, 1'b1, 1'b?}: begin for (int e = 0; e < 2; e++) begin fp16[e] = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[15:0]; vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[32*e +: 32] = fp32_from_fp16(fp16[e], fp16_m_lzc[e]); end end - {EW64, 1'b?, 1'b1, 1'b1}: begin + {EW64, 1'b?, 1'b?, 1'b1, 1'b1}: begin fp32 = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[31:0]; vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op = fp64_from_fp32(fp32, fp32_m_lzc);