diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics.rs b/compiler/rustc_const_eval/src/interpret/intrinsics.rs
index fc7f1166af99a..aaefcb1b24c31 100644
--- a/compiler/rustc_const_eval/src/interpret/intrinsics.rs
+++ b/compiler/rustc_const_eval/src/interpret/intrinsics.rs
@@ -25,6 +25,31 @@ use super::{
 };
 use crate::fluent_generated as fluent;
 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum MulAddType {
+    /// Used with `fma` and `simd_fma`: always performs a fused multiply-add.
+    Fused,
+    /// Used with `fmuladd` and `simd_relaxed_fma`: the machine decides, per operation, whether to
+    /// fuse (`mul_add`) or to compute `a * b + c` with separate operations.
+    Nondeterministic,
+}
+
+#[derive(Copy, Clone)]
+pub(crate) enum MinMax {
+    /// The IEEE `Minimum` operation - see `f32::minimum` etc.
+    /// In particular, `-0.0` is considered smaller than `+0.0`.
+    Minimum,
+    /// The IEEE `MinNum` operation - see `f32::min` etc.
+    /// In particular, if the inputs are `-0.0` and `+0.0`, the result is non-deterministic.
+    MinNum,
+    /// The IEEE `Maximum` operation - see `f32::maximum` etc.
+    /// In particular, `-0.0` is considered smaller than `+0.0`.
+    Maximum,
+    /// The IEEE `MaxNum` operation - see `f32::max` etc.
+    /// In particular, if the inputs are `-0.0` and `+0.0`, the result is non-deterministic.
+    MaxNum,
+}
+
 /// Directly returns an `Allocation` containing an absolute path representation of the given type.
 pub(crate) fn alloc_type_name<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> (AllocId, u64) {
     let path = crate::util::type_name(tcx, ty);
@@ -486,25 +511,33 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                 self.write_scalar(Scalar::from_target_usize(align.bytes(), self), dest)?;
             }
 
-            sym::minnumf16 => self.float_min_intrinsic::<Half>(args, dest)?,
-            sym::minnumf32 => self.float_min_intrinsic::<Single>(args, dest)?,
-            sym::minnumf64 => self.float_min_intrinsic::<Double>(args, dest)?,
-            sym::minnumf128 => self.float_min_intrinsic::<Quad>(args, dest)?,
+            sym::minnumf16 => self.float_minmax_intrinsic::<Half>(args, MinMax::MinNum, dest)?,
+            sym::minnumf32 => self.float_minmax_intrinsic::<Single>(args, MinMax::MinNum, dest)?,
+            sym::minnumf64 => self.float_minmax_intrinsic::<Double>(args, MinMax::MinNum, dest)?,
+            sym::minnumf128 => self.float_minmax_intrinsic::<Quad>(args, MinMax::MinNum, dest)?,
 
-            sym::minimumf16 => self.float_minimum_intrinsic::<Half>(args, dest)?,
-            sym::minimumf32 => self.float_minimum_intrinsic::<Single>(args, dest)?,
-            sym::minimumf64 => self.float_minimum_intrinsic::<Double>(args, dest)?,
-            sym::minimumf128 => self.float_minimum_intrinsic::<Quad>(args, dest)?,
+            sym::minimumf16 => self.float_minmax_intrinsic::<Half>(args, MinMax::Minimum, dest)?,
+            sym::minimumf32 => {
+                self.float_minmax_intrinsic::<Single>(args, MinMax::Minimum, dest)?
+            }
+            sym::minimumf64 => {
+                self.float_minmax_intrinsic::<Double>(args, MinMax::Minimum, dest)?
+            }
+            sym::minimumf128 => self.float_minmax_intrinsic::<Quad>(args, MinMax::Minimum, dest)?,
 
-            sym::maxnumf16 => self.float_max_intrinsic::<Half>(args, dest)?,
-            sym::maxnumf32 => self.float_max_intrinsic::<Single>(args, dest)?,
-            sym::maxnumf64 => self.float_max_intrinsic::<Double>(args, dest)?,
-            sym::maxnumf128 => self.float_max_intrinsic::<Quad>(args, dest)?,
+            sym::maxnumf16 => self.float_minmax_intrinsic::<Half>(args, MinMax::MaxNum, dest)?,
+            sym::maxnumf32 => self.float_minmax_intrinsic::<Single>(args, MinMax::MaxNum, dest)?,
+            sym::maxnumf64 => self.float_minmax_intrinsic::<Double>(args, MinMax::MaxNum, dest)?,
+            sym::maxnumf128 => self.float_minmax_intrinsic::<Quad>(args, MinMax::MaxNum, dest)?,
 
-            sym::maximumf16 => self.float_maximum_intrinsic::<Half>(args, dest)?,
-            sym::maximumf32 => self.float_maximum_intrinsic::<Single>(args, dest)?,
-            sym::maximumf64 => self.float_maximum_intrinsic::<Double>(args, dest)?,
-            sym::maximumf128 => self.float_maximum_intrinsic::<Quad>(args, dest)?,
+            sym::maximumf16 => self.float_minmax_intrinsic::<Half>(args, MinMax::Maximum, dest)?,
+            sym::maximumf32 => {
+                self.float_minmax_intrinsic::<Single>(args, MinMax::Maximum, dest)?
+            }
+            sym::maximumf64 => {
+                self.float_minmax_intrinsic::<Double>(args, MinMax::Maximum, dest)?
+            }
+            sym::maximumf128 => self.float_minmax_intrinsic::<Quad>(args, MinMax::Maximum, dest)?,
 
             sym::copysignf16 => self.float_copysign_intrinsic::<Half>(args, dest)?,
             sym::copysignf32 => self.float_copysign_intrinsic::<Single>(args, dest)?,
@@ -612,14 +645,22 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                 dest,
                 rustc_apfloat::Round::NearestTiesToEven,
             )?,
-            sym::fmaf16 => self.fma_intrinsic::<Half>(args, dest)?,
-            sym::fmaf32 => self.fma_intrinsic::<Single>(args, dest)?,
-            sym::fmaf64 => self.fma_intrinsic::<Double>(args, dest)?,
-            sym::fmaf128 => self.fma_intrinsic::<Quad>(args, dest)?,
-            sym::fmuladdf16 => self.float_muladd_intrinsic::<Half>(args, dest)?,
-            sym::fmuladdf32 => self.float_muladd_intrinsic::<Single>(args, dest)?,
-            sym::fmuladdf64 => self.float_muladd_intrinsic::<Double>(args, dest)?,
-            sym::fmuladdf128 => self.float_muladd_intrinsic::<Quad>(args, dest)?,
+            sym::fmaf16 => self.float_muladd_intrinsic::<Half>(args, dest, MulAddType::Fused)?,
+            sym::fmaf32 => self.float_muladd_intrinsic::<Single>(args, dest, MulAddType::Fused)?,
+            sym::fmaf64 => self.float_muladd_intrinsic::<Double>(args, dest, MulAddType::Fused)?,
+            sym::fmaf128 => self.float_muladd_intrinsic::<Quad>(args, dest, MulAddType::Fused)?,
+            sym::fmuladdf16 => {
+                self.float_muladd_intrinsic::<Half>(args, dest, MulAddType::Nondeterministic)?
+            }
+            sym::fmuladdf32 => {
+                self.float_muladd_intrinsic::<Single>(args, dest, MulAddType::Nondeterministic)?
+            }
+            sym::fmuladdf64 => {
+                self.float_muladd_intrinsic::<Double>(args, dest, MulAddType::Nondeterministic)?
+            }
+            sym::fmuladdf128 => {
+                self.float_muladd_intrinsic::<Quad>(args, dest, MulAddType::Nondeterministic)?
+            }
 
             // Unsupported intrinsic: skip the return_to_block below.
             _ => return interp_ok(false),
@@ -901,76 +942,45 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
         interp_ok(Scalar::from_bool(lhs_bytes == rhs_bytes))
     }
 
-    fn float_min_intrinsic<F>(
-        &mut self,
-        args: &[OpTy<'tcx, M::Provenance>],
-        dest: &PlaceTy<'tcx, M::Provenance>,
-    ) -> InterpResult<'tcx, ()>
-    where
-        F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
-    {
-        let a: F = self.read_scalar(&args[0])?.to_float()?;
-        let b: F = self.read_scalar(&args[1])?.to_float()?;
-        let res = if a == b {
-            // They are definitely not NaN (those are never equal), but they could be `+0` and `-0`.
-            // Let the machine decide which one to return.
-            M::equal_float_min_max(self, a, b)
-        } else {
-            self.adjust_nan(a.min(b), &[a, b])
-        };
-        self.write_scalar(res, dest)?;
-        interp_ok(())
-    }
-
-    fn float_max_intrinsic<F>(
-        &mut self,
-        args: &[OpTy<'tcx, M::Provenance>],
-        dest: &PlaceTy<'tcx, M::Provenance>,
-    ) -> InterpResult<'tcx, ()>
+    fn float_minmax<F>(
+        &self,
+        a: Scalar<M::Provenance>,
+        b: Scalar<M::Provenance>,
+        op: MinMax,
+    ) -> InterpResult<'tcx, Scalar<M::Provenance>>
     where
         F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
     {
-        let a: F = self.read_scalar(&args[0])?.to_float()?;
-        let b: F = self.read_scalar(&args[1])?.to_float()?;
-        let res = if a == b {
+        let a: F = a.to_float()?;
+        let b: F = b.to_float()?;
+        let res = if matches!(op, MinMax::MinNum | MinMax::MaxNum) && a == b {
             // They are definitely not NaN (those are never equal), but they could be `+0` and `-0`.
             // Let the machine decide which one to return.
             M::equal_float_min_max(self, a, b)
         } else {
-            self.adjust_nan(a.max(b), &[a, b])
+            let result = match op {
+                MinMax::Minimum => a.minimum(b),
+                MinMax::MinNum => a.min(b),
+                MinMax::Maximum => a.maximum(b),
+                MinMax::MaxNum => a.max(b),
+            };
+            self.adjust_nan(result, &[a, b])
         };
-        self.write_scalar(res, dest)?;
-        interp_ok(())
-    }
 
-    fn float_minimum_intrinsic<F>(
-        &mut self,
-        args: &[OpTy<'tcx, M::Provenance>],
-        dest: &PlaceTy<'tcx, M::Provenance>,
-    ) -> InterpResult<'tcx, ()>
-    where
-        F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
-    {
-        let a: F = self.read_scalar(&args[0])?.to_float()?;
-        let b: F = self.read_scalar(&args[1])?.to_float()?;
-        let res = a.minimum(b);
-        let res = self.adjust_nan(res, &[a, b]);
-        self.write_scalar(res, dest)?;
-        interp_ok(())
+        interp_ok(res.into())
     }
 
-    fn float_maximum_intrinsic<F>(
+    fn float_minmax_intrinsic<F>(
         &mut self,
         args: &[OpTy<'tcx, M::Provenance>],
+        op: MinMax,
         dest: &PlaceTy<'tcx, M::Provenance>,
     ) -> InterpResult<'tcx, ()>
     where
         F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
     {
-        let a: F = self.read_scalar(&args[0])?.to_float()?;
-        let b: F = self.read_scalar(&args[1])?.to_float()?;
-        let res = a.maximum(b);
-        let res = self.adjust_nan(res, &[a, b]);
+        let res =
+            self.float_minmax::<F>(self.read_scalar(&args[0])?, self.read_scalar(&args[1])?, op)?;
         self.write_scalar(res, dest)?;
         interp_ok(())
     }
@@ -1004,56 +1014,69 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
         interp_ok(())
     }
 
-    fn float_round_intrinsic<F>(
-        &mut self,
-        args: &[OpTy<'tcx, M::Provenance>],
-        dest: &PlaceTy<'tcx, M::Provenance>,
+    fn float_round<F>(
+        &self,
+        x: Scalar<M::Provenance>,
         mode: rustc_apfloat::Round,
-    ) -> InterpResult<'tcx, ()>
+    ) -> InterpResult<'tcx, Scalar<M::Provenance>>
     where
         F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
     {
-        let x: F = self.read_scalar(&args[0])?.to_float()?;
+        let x: F = x.to_float()?;
         let res = x.round_to_integral(mode).value;
         let res = self.adjust_nan(res, &[x]);
-        self.write_scalar(res, dest)?;
-        interp_ok(())
+        interp_ok(res.into())
     }
 
-    fn fma_intrinsic<F>(
+    fn float_round_intrinsic<F>(
         &mut self,
         args: &[OpTy<'tcx, M::Provenance>],
         dest: &PlaceTy<'tcx, M::Provenance>,
+        mode: rustc_apfloat::Round,
     ) -> InterpResult<'tcx, ()>
     where
         F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
     {
-        let a: F = self.read_scalar(&args[0])?.to_float()?;
-        let b: F = self.read_scalar(&args[1])?.to_float()?;
-        let c: F = self.read_scalar(&args[2])?.to_float()?;
-
-        let res = a.mul_add(b, c).value;
-        let res = self.adjust_nan(res, &[a, b, c]);
+        let res = self.float_round::<F>(self.read_scalar(&args[0])?, mode)?;
         self.write_scalar(res, dest)?;
         interp_ok(())
     }
 
+    /// Computes `a * b + c` on floats of type `F`. `MulAddType::Fused` always uses a fused
+    /// multiply-add; otherwise the machine picks between fused and separate operations.
+    fn float_muladd<F>(
+        &self,
+        a: Scalar<M::Provenance>,
+        b: Scalar<M::Provenance>,
+        c: Scalar<M::Provenance>,
+        typ: MulAddType,
+    ) -> InterpResult<'tcx, Scalar<M::Provenance>>
+    where
+        F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
+    {
+        let (x, y, z): (F, F, F) = (a.to_float()?, b.to_float()?, c.to_float()?);
+        let fused = match typ {
+            MulAddType::Fused => true,
+            MulAddType::Nondeterministic => M::float_fuse_mul_add(self),
+        };
+        let raw = if fused { x.mul_add(y, z).value } else { ((x * y).value + z).value };
+        interp_ok(self.adjust_nan(raw, &[x, y, z]).into())
+    }
+
     fn float_muladd_intrinsic<F>(
         &mut self,
         args: &[OpTy<'tcx, M::Provenance>],
         dest: &PlaceTy<'tcx, M::Provenance>,
+        typ: MulAddType,
     ) -> InterpResult<'tcx, ()>
     where
         F: rustc_apfloat::Float + rustc_apfloat::FloatConvert<F> + Into<Scalar<M::Provenance>>,
     {
-        let a: F = self.read_scalar(&args[0])?.to_float()?;
-        let b: F = self.read_scalar(&args[1])?.to_float()?;
-        let c: F = self.read_scalar(&args[2])?.to_float()?;
-
-        let fuse = M::float_fuse_mul_add(self);
+        let a = self.read_scalar(&args[0])?;
+        let b = self.read_scalar(&args[1])?;
+        let c = self.read_scalar(&args[2])?;
 
-        let res = if fuse { a.mul_add(b, c).value } else { ((a * b).value + c).value };
-        let res = self.adjust_nan(res, &[a, b, c]);
+        let res = self.float_muladd::<F>(a, b, c, typ)?;
         self.write_scalar(res, dest)?;
         interp_ok(())
     }
diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics/simd.rs b/compiler/rustc_const_eval/src/interpret/intrinsics/simd.rs
index 0dba66ae93721..d39005b98731c 100644
--- a/compiler/rustc_const_eval/src/interpret/intrinsics/simd.rs
+++ b/compiler/rustc_const_eval/src/interpret/intrinsics/simd.rs
@@ -1,5 +1,6 @@
 use either::Either;
 use rustc_abi::Endian;
+use rustc_apfloat::ieee::{Double, Half, Quad, Single};
 use rustc_apfloat::{Float, Round};
 use rustc_middle::mir::interpret::{InterpErrorKind, UndefinedBehaviorInfo};
 use rustc_middle::ty::FloatTy;
@@ -8,17 +9,11 @@ use rustc_span::{Symbol, sym};
 use tracing::trace;
 
 use super::{
-    ImmTy, InterpCx, InterpResult, Machine, OpTy, PlaceTy, Provenance, Scalar, Size, interp_ok,
-    throw_ub_format,
+    ImmTy, InterpCx, InterpResult, Machine, MinMax, MulAddType, OpTy, PlaceTy, Provenance, Scalar,
+    Size, interp_ok, throw_ub_format,
 };
 use crate::interpret::Writeable;
 
-#[derive(Copy, Clone)]
-pub(crate) enum MinMax {
-    Min,
-    Max,
-}
-
 impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
     /// Returns `true` if emulation happened.
     /// Here we implement the intrinsics that are common to all CTFE instances; individual machines can add their own
@@ -125,10 +120,10 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                             let op = op.to_scalar();
                             // "Bitwise" operation, no NaN adjustments
                             match float_ty {
-                                FloatTy::F16 => unimplemented!("f16_f128"),
+                                FloatTy::F16 => Scalar::from_f16(op.to_f16()?.abs()),
                                 FloatTy::F32 => Scalar::from_f32(op.to_f32()?.abs()),
                                 FloatTy::F64 => Scalar::from_f64(op.to_f64()?.abs()),
-                                FloatTy::F128 => unimplemented!("f16_f128"),
+                                FloatTy::F128 => Scalar::from_f128(op.to_f128()?.abs()),
                             }
                         }
                         Op::Round(rounding) => {
@@ -139,21 +134,12 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                                     intrinsic_name
                                 )
                             };
+                            let op = op.to_scalar();
                             match float_ty {
-                                FloatTy::F16 => unimplemented!("f16_f128"),
-                                FloatTy::F32 => {
-                                    let f = op.to_scalar().to_f32()?;
-                                    let res = f.round_to_integral(rounding).value;
-                                    let res = self.adjust_nan(res, &[f]);
-                                    Scalar::from_f32(res)
-                                }
-                                FloatTy::F64 => {
-                                    let f = op.to_scalar().to_f64()?;
-                                    let res = f.round_to_integral(rounding).value;
-                                    let res = self.adjust_nan(res, &[f]);
-                                    Scalar::from_f64(res)
-                                }
-                                FloatTy::F128 => unimplemented!("f16_f128"),
+                                FloatTy::F16 => self.float_round::<Half>(op, rounding)?,
+                                FloatTy::F32 => self.float_round::<Single>(op, rounding)?,
+                                FloatTy::F64 => self.float_round::<Double>(op, rounding)?,
+                                FloatTy::F128 => self.float_round::<Quad>(op, rounding)?,
                             }
                         }
                         Op::Numeric(name) => {
@@ -216,8 +202,8 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                     sym::simd_le => Op::MirOp(BinOp::Le),
                     sym::simd_gt => Op::MirOp(BinOp::Gt),
                     sym::simd_ge => Op::MirOp(BinOp::Ge),
-                    sym::simd_fmax => Op::FMinMax(MinMax::Max),
-                    sym::simd_fmin => Op::FMinMax(MinMax::Min),
+                    sym::simd_fmax => Op::FMinMax(MinMax::MaxNum),
+                    sym::simd_fmin => Op::FMinMax(MinMax::MinNum),
                     sym::simd_saturating_add => Op::SaturatingOp(BinOp::Add),
                     sym::simd_saturating_sub => Op::SaturatingOp(BinOp::Sub),
                     sym::simd_arith_offset => Op::WrappingOffset,
@@ -309,8 +295,8 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                     sym::simd_reduce_xor => Op::MirOp(BinOp::BitXor),
                     sym::simd_reduce_any => Op::MirOpBool(BinOp::BitOr),
                     sym::simd_reduce_all => Op::MirOpBool(BinOp::BitAnd),
-                    sym::simd_reduce_max => Op::MinMax(MinMax::Max),
-                    sym::simd_reduce_min => Op::MinMax(MinMax::Min),
+                    sym::simd_reduce_max => Op::MinMax(MinMax::MaxNum),
+                    sym::simd_reduce_min => Op::MinMax(MinMax::MinNum),
                     _ => unreachable!(),
                 };
 
@@ -334,8 +320,8 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                             } else {
-                                // Just boring integers, so NaNs to worry about
+                                // Just boring integers, so no NaNs to worry about
                                 let mirop = match mmop {
-                                    MinMax::Min => BinOp::Le,
-                                    MinMax::Max => BinOp::Ge,
+                                    MinMax::MinNum | MinMax::Minimum => BinOp::Le,
+                                    MinMax::MaxNum | MinMax::Maximum => BinOp::Ge,
                                 };
                                 if self.binary_op(mirop, &res, &op)?.to_scalar().to_bool()? {
                                     res
@@ -701,6 +687,43 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
                     };
                 }
             }
+            sym::simd_fma | sym::simd_relaxed_fma => {
+                // `simd_fma` should always deterministically use `mul_add`, whereas `relaxed_fma`
+                // is non-deterministic, and can use either `mul_add` or `a * b + c`
+                let typ = match intrinsic_name {
+                    sym::simd_fma => MulAddType::Fused,
+                    sym::simd_relaxed_fma => MulAddType::Nondeterministic,
+                    _ => unreachable!(),
+                };
+
+                let (a, a_len) = self.project_to_simd(&args[0])?;
+                let (b, b_len) = self.project_to_simd(&args[1])?;
+                let (c, c_len) = self.project_to_simd(&args[2])?;
+                let (dest, dest_len) = self.project_to_simd(&dest)?;
+
+                assert_eq!(dest_len, a_len);
+                assert_eq!(dest_len, b_len);
+                assert_eq!(dest_len, c_len);
+
+                for i in 0..dest_len {
+                    let a = self.read_scalar(&self.project_index(&a, i)?)?;
+                    let b = self.read_scalar(&self.project_index(&b, i)?)?;
+                    let c = self.read_scalar(&self.project_index(&c, i)?)?;
+                    let dest = self.project_index(&dest, i)?;
+
+                    let ty::Float(float_ty) = dest.layout.ty.kind() else {
+                        span_bug!(self.cur_span(), "{} operand is not a float", intrinsic_name)
+                    };
+
+                    let val = match float_ty {
+                        FloatTy::F16 => self.float_muladd::<Half>(a, b, c, typ)?,
+                        FloatTy::F32 => self.float_muladd::<Single>(a, b, c, typ)?,
+                        FloatTy::F64 => self.float_muladd::<Double>(a, b, c, typ)?,
+                        FloatTy::F128 => self.float_muladd::<Quad>(a, b, c, typ)?,
+                    };
+                    self.write_scalar(val, &dest)?;
+                }
+            }
 
             // Unsupported intrinsic: skip the return_to_block below.
             _ => return interp_ok(false),
@@ -711,12 +734,12 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
         interp_ok(true)
     }
 
-    fn fminmax_op<Prov: Provenance>(
+    fn fminmax_op(
         &self,
         op: MinMax,
-        left: &ImmTy<'tcx, Prov>,
-        right: &ImmTy<'tcx, Prov>,
-    ) -> InterpResult<'tcx, Scalar<Prov>> {
+        left: &ImmTy<'tcx, M::Provenance>,
+        right: &ImmTy<'tcx, M::Provenance>,
+    ) -> InterpResult<'tcx, Scalar<M::Provenance>> {
         assert_eq!(left.layout.ty, right.layout.ty);
         let ty::Float(float_ty) = left.layout.ty.kind() else {
             bug!("fmax operand is not a float")
@@ -724,28 +747,10 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
         let left = left.to_scalar();
         let right = right.to_scalar();
         interp_ok(match float_ty {
-            FloatTy::F16 => unimplemented!("f16_f128"),
-            FloatTy::F32 => {
-                let left = left.to_f32()?;
-                let right = right.to_f32()?;
-                let res = match op {
-                    MinMax::Min => left.min(right),
-                    MinMax::Max => left.max(right),
-                };
-                let res = self.adjust_nan(res, &[left, right]);
-                Scalar::from_f32(res)
-            }
-            FloatTy::F64 => {
-                let left = left.to_f64()?;
-                let right = right.to_f64()?;
-                let res = match op {
-                    MinMax::Min => left.min(right),
-                    MinMax::Max => left.max(right),
-                };
-                let res = self.adjust_nan(res, &[left, right]);
-                Scalar::from_f64(res)
-            }
-            FloatTy::F128 => unimplemented!("f16_f128"),
+            FloatTy::F16 => self.float_minmax::<Half>(left, right, op)?,
+            FloatTy::F32 => self.float_minmax::<Single>(left, right, op)?,
+            FloatTy::F64 => self.float_minmax::<Double>(left, right, op)?,
+            FloatTy::F128 => self.float_minmax::<Quad>(left, right, op)?,
         })
     }
 }
diff --git a/compiler/rustc_const_eval/src/interpret/machine.rs b/compiler/rustc_const_eval/src/interpret/machine.rs
index 1725635e0b479..236c35ec7b96a 100644
--- a/compiler/rustc_const_eval/src/interpret/machine.rs
+++ b/compiler/rustc_const_eval/src/interpret/machine.rs
@@ -290,7 +290,7 @@ pub trait Machine<'tcx>: Sized {
     }
 
-    /// Determines whether the `fmuladd` intrinsics fuse the multiply-add or use separate operations.
+    /// Whether `fmuladd` and `simd_relaxed_fma` fuse the multiply-add or use separate operations.
-    fn float_fuse_mul_add(_ecx: &mut InterpCx<'tcx, Self>) -> bool;
+    fn float_fuse_mul_add(_ecx: &InterpCx<'tcx, Self>) -> bool;
 
     /// Called before a basic block terminator is executed.
     #[inline]
@@ -676,7 +676,7 @@ pub macro compile_time_machine(<$tcx: lifetime>) {
     }
 
     #[inline(always)]
-    fn float_fuse_mul_add(_ecx: &mut InterpCx<$tcx, Self>) -> bool {
+    fn float_fuse_mul_add(_ecx: &InterpCx<$tcx, Self>) -> bool {
         true
     }
 
diff --git a/src/tools/miri/src/intrinsics/simd.rs b/src/tools/miri/src/intrinsics/simd.rs
index 5f75657e0a220..2246edb9a6df2 100644
--- a/src/tools/miri/src/intrinsics/simd.rs
+++ b/src/tools/miri/src/intrinsics/simd.rs
@@ -1,5 +1,3 @@
-use rand::Rng;
-use rustc_apfloat::Float;
 use rustc_middle::ty::FloatTy;
 use rustc_middle::ty;
 
@@ -83,62 +81,6 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                     this.write_scalar(val, &dest)?;
                 }
             }
-            "fma" | "relaxed_fma" => {
-                let [a, b, c] = check_intrinsic_arg_count(args)?;
-                let (a, a_len) = this.project_to_simd(a)?;
-                let (b, b_len) = this.project_to_simd(b)?;
-                let (c, c_len) = this.project_to_simd(c)?;
-                let (dest, dest_len) = this.project_to_simd(dest)?;
-
-                assert_eq!(dest_len, a_len);
-                assert_eq!(dest_len, b_len);
-                assert_eq!(dest_len, c_len);
-
-                for i in 0..dest_len {
-                    let a = this.read_scalar(&this.project_index(&a, i)?)?;
-                    let b = this.read_scalar(&this.project_index(&b, i)?)?;
-                    let c = this.read_scalar(&this.project_index(&c, i)?)?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    let fuse: bool = intrinsic_name == "fma"
-                        || (this.machine.float_nondet && this.machine.rng.get_mut().random());
-
-                    // Works for f32 and f64.
-                    // FIXME: using host floats to work around https://github.com/rust-lang/miri/issues/2468.
-                    let ty::Float(float_ty) = dest.layout.ty.kind() else {
-                        span_bug!(this.cur_span(), "{} operand is not a float", intrinsic_name)
-                    };
-                    let val = match float_ty {
-                        FloatTy::F16 => unimplemented!("f16_f128"),
-                        FloatTy::F32 => {
-                            let a = a.to_f32()?;
-                            let b = b.to_f32()?;
-                            let c = c.to_f32()?;
-                            let res = if fuse {
-                                a.mul_add(b, c).value
-                            } else {
-                                ((a * b).value + c).value
-                            };
-                            let res = this.adjust_nan(res, &[a, b, c]);
-                            Scalar::from(res)
-                        }
-                        FloatTy::F64 => {
-                            let a = a.to_f64()?;
-                            let b = b.to_f64()?;
-                            let c = c.to_f64()?;
-                            let res = if fuse {
-                                a.mul_add(b, c).value
-                            } else {
-                                ((a * b).value + c).value
-                            };
-                            let res = this.adjust_nan(res, &[a, b, c]);
-                            Scalar::from(res)
-                        }
-                        FloatTy::F128 => unimplemented!("f16_f128"),
-                    };
-                    this.write_scalar(val, &dest)?;
-                }
-            }
             "expose_provenance" => {
                 let [op] = check_intrinsic_arg_count(args)?;
                 let (op, op_len) = this.project_to_simd(op)?;
diff --git a/src/tools/miri/src/machine.rs b/src/tools/miri/src/machine.rs
index 412640a112c09..9e0b6f15acba5 100644
--- a/src/tools/miri/src/machine.rs
+++ b/src/tools/miri/src/machine.rs
@@ -1294,8 +1294,8 @@ impl<'tcx> Machine<'tcx> for MiriMachine<'tcx> {
     }
 
     #[inline(always)]
-    fn float_fuse_mul_add(ecx: &mut InterpCx<'tcx, Self>) -> bool {
-        ecx.machine.float_nondet && ecx.machine.rng.get_mut().random()
+    fn float_fuse_mul_add(ecx: &InterpCx<'tcx, Self>) -> bool {
+        ecx.machine.float_nondet && ecx.machine.rng.borrow_mut().random()
     }
 
     #[inline(always)]
diff --git a/src/tools/miri/tests/pass/intrinsics/portable-simd.rs b/src/tools/miri/tests/pass/intrinsics/portable-simd.rs
index e2cd08733af1c..e5e84f0c5aeb3 100644
--- a/src/tools/miri/tests/pass/intrinsics/portable-simd.rs
+++ b/src/tools/miri/tests/pass/intrinsics/portable-simd.rs
@@ -6,18 +6,143 @@
     rustc_attrs,
     intrinsics,
     core_intrinsics,
-    repr_simd
+    repr_simd,
+    f16,
+    f128
 )]
-#![allow(incomplete_features, internal_features)]
+#![allow(incomplete_features, internal_features, non_camel_case_types)]
+use std::fmt::{self, Debug, Formatter};
 use std::intrinsics::simd as intrinsics;
 use std::ptr;
 use std::simd::StdFloat;
 use std::simd::prelude::*;
 
+#[repr(simd, packed)]
+#[derive(Copy)]
+struct PackedSimd<T, const N: usize>([T; N]);
+
+impl<T: Copy, const N: usize> Clone for PackedSimd<T, N> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T: PartialEq + Copy, const N: usize> PartialEq for PackedSimd<T, N> {
+    fn eq(&self, other: &Self) -> bool {
+        self.into_array() == other.into_array()
+    }
+}
+
+impl<T: Debug + Copy, const N: usize> Debug for PackedSimd<T, N> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        Debug::fmt(&self.into_array(), f)
+    }
+}
+
+type f16x2 = PackedSimd<f16, 2>;
+type f16x4 = PackedSimd<f16, 4>;
+
+type f128x2 = PackedSimd<f128, 2>;
+type f128x4 = PackedSimd<f128, 4>;
+
+impl<T: Copy, const N: usize> PackedSimd<T, N> {
+    fn splat(x: T) -> Self {
+        Self([x; N])
+    }
+    fn from_array(a: [T; N]) -> Self {
+        Self(a)
+    }
+    fn into_array(self) -> [T; N] {
+        // SAFETY: `repr(simd, packed)` leaves no padding, so `self`'s bytes are a valid `[T; N]`.
+        unsafe { std::mem::transmute_copy(&self) }
+    }
+}
+
 #[rustc_intrinsic]
 #[rustc_nounwind]
 pub unsafe fn simd_shuffle_const_generic<T, U, const IDX: &'static [u32]>(x: T, y: T) -> U;
 
+pub fn simd_ops_f16() {
+    use intrinsics::*;
+
+    // Hack: swap `assert_eq!` operands so the typed expected value comes first (helps inference).
+    macro_rules! assert_eq {
+        ($a:expr, $b:expr $(,$t:tt)*) => {
+            ::std::assert_eq!($b, $a $(,$t)*)
+        }
+    }
+
+    let a = f16x4::splat(10.0);
+    let b = f16x4::from_array([1.0, 2.0, 3.0, -4.0]);
+
+    unsafe {
+        assert_eq!(simd_neg(b), f16x4::from_array([-1.0, -2.0, -3.0, 4.0]));
+        assert_eq!(simd_add(a, b), f16x4::from_array([11.0, 12.0, 13.0, 6.0]));
+        assert_eq!(simd_sub(a, b), f16x4::from_array([9.0, 8.0, 7.0, 14.0]));
+        assert_eq!(simd_mul(a, b), f16x4::from_array([10.0, 20.0, 30.0, -40.0]));
+        assert_eq!(simd_div(b, a), f16x4::from_array([0.1, 0.2, 0.3, -0.4]));
+        assert_eq!(simd_div(a, f16x4::splat(2.0)), f16x4::splat(5.0));
+        assert_eq!(simd_rem(a, b), f16x4::from_array([0.0, 0.0, 1.0, 2.0]));
+        assert_eq!(simd_fabs(b), f16x4::from_array([1.0, 2.0, 3.0, 4.0]));
+        assert_eq!(
+            simd_fmax(a, simd_mul(b, f16x4::splat(4.0))),
+            f16x4::from_array([10.0, 10.0, 12.0, 10.0])
+        );
+        assert_eq!(
+            simd_fmin(a, simd_mul(b, f16x4::splat(4.0))),
+            f16x4::from_array([4.0, 8.0, 10.0, -16.0])
+        );
+
+        assert_eq!(simd_fma(a, b, a), simd_add(simd_mul(a, b), a));
+        assert_eq!(simd_fma(b, b, a), simd_add(simd_mul(b, b), a));
+        assert_eq!(simd_fma(a, b, b), simd_add(simd_mul(a, b), b));
+        assert_eq!(
+            simd_fma(f16x4::splat(-3.2), b, f16x4::splat(f16::NEG_INFINITY)),
+            f16x4::splat(f16::NEG_INFINITY)
+        );
+
+        assert_eq!(simd_relaxed_fma(a, b, a), simd_add(simd_mul(a, b), a));
+        assert_eq!(simd_relaxed_fma(b, b, a), simd_add(simd_mul(b, b), a));
+        assert_eq!(simd_relaxed_fma(a, b, b), simd_add(simd_mul(a, b), b));
+        assert_eq!(
+            simd_relaxed_fma(f16x4::splat(-3.2), b, f16x4::splat(f16::NEG_INFINITY)),
+            f16x4::splat(f16::NEG_INFINITY)
+        );
+
+        assert_eq!(simd_fsqrt(simd_mul(a, a)), a);
+        assert_eq!(simd_fsqrt(simd_mul(b, b)), simd_fabs(b));
+
+        assert_eq!(simd_eq(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, !0, 0, 0]));
+        assert_eq!(simd_ne(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, 0, !0, !0]));
+        assert_eq!(simd_le(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, !0, !0, 0]));
+        assert_eq!(simd_lt(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, 0, !0, 0]));
+        assert_eq!(simd_ge(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, !0, 0, !0]));
+        assert_eq!(simd_gt(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, 0, 0, !0]));
+
+        assert_eq!(simd_reduce_add_ordered(a, 0.0), 40.0f16);
+        assert_eq!(simd_reduce_add_ordered(b, 0.0), 2.0f16);
+        assert_eq!(simd_reduce_mul_ordered(a, 1.0), 10000.0f16);
+        assert_eq!(simd_reduce_mul_ordered(b, 1.0), -24.0f16);
+        assert_eq!(simd_reduce_max(a), 10.0f16);
+        assert_eq!(simd_reduce_max(b), 3.0f16);
+        assert_eq!(simd_reduce_min(a), 10.0f16);
+        assert_eq!(simd_reduce_min(b), -4.0f16);
+
+        assert_eq!(
+            simd_fmax(f16x2::from_array([0.0, f16::NAN]), f16x2::from_array([f16::NAN, 0.0])),
+            f16x2::from_array([0.0, 0.0])
+        );
+        assert_eq!(simd_reduce_max(f16x2::from_array([0.0, f16::NAN])), 0.0f16);
+        assert_eq!(simd_reduce_max(f16x2::from_array([f16::NAN, 0.0])), 0.0f16);
+        assert_eq!(
+            simd_fmin(f16x2::from_array([0.0, f16::NAN]), f16x2::from_array([f16::NAN, 0.0])),
+            f16x2::from_array([0.0, 0.0])
+        );
+        assert_eq!(simd_reduce_min(f16x2::from_array([0.0, f16::NAN])), 0.0f16);
+        assert_eq!(simd_reduce_min(f16x2::from_array([f16::NAN, 0.0])), 0.0f16);
+    }
+}
+
 fn simd_ops_f32() {
     let a = f32x4::splat(10.0);
     let b = f32x4::from_array([1.0, 2.0, 3.0, -4.0]);
@@ -148,6 +273,87 @@ fn simd_ops_f64() {
     assert_eq!(f64x2::from_array([f64::NAN, 0.0]).reduce_min(), 0.0);
 }
 
+pub fn simd_ops_f128() {
+    use intrinsics::*;
+
+    // Small type-inference hack: forward the operands to the real `assert_eq!` in swapped order.
+    macro_rules! assert_eq {
+        ($actual:expr, $expected:expr $(,$rest:tt)*) => {
+            ::std::assert_eq!($expected, $actual $(,$rest)*)
+        }
+    }
+
+    let tens = f128x4::splat(10.0);
+    let mixed = f128x4::from_array([1.0, 2.0, 3.0, -4.0]);
+
+    unsafe {
+        // Element-wise arithmetic on the two input vectors.
+        assert_eq!(simd_neg(mixed), f128x4::from_array([-1.0, -2.0, -3.0, 4.0]));
+        assert_eq!(simd_add(tens, mixed), f128x4::from_array([11.0, 12.0, 13.0, 6.0]));
+        assert_eq!(simd_sub(tens, mixed), f128x4::from_array([9.0, 8.0, 7.0, 14.0]));
+        assert_eq!(simd_mul(tens, mixed), f128x4::from_array([10.0, 20.0, 30.0, -40.0]));
+        assert_eq!(simd_div(mixed, tens), f128x4::from_array([0.1, 0.2, 0.3, -0.4]));
+        assert_eq!(simd_div(tens, f128x4::splat(2.0)), f128x4::splat(5.0));
+        assert_eq!(simd_rem(tens, mixed), f128x4::from_array([0.0, 0.0, 1.0, 2.0]));
+        assert_eq!(simd_fabs(mixed), f128x4::from_array([1.0, 2.0, 3.0, 4.0]));
+
+        // Element-wise maximum and minimum of `tens` and `mixed * 4`.
+        let times_four = simd_mul(mixed, f128x4::splat(4.0));
+        assert_eq!(simd_fmax(tens, times_four), f128x4::from_array([10.0, 10.0, 12.0, 10.0]));
+        assert_eq!(simd_fmin(tens, times_four), f128x4::from_array([4.0, 8.0, 10.0, -16.0]));
+
+        // On these inputs `simd_fma` must agree with a separate multiply followed by an add.
+        assert_eq!(simd_fma(tens, mixed, tens), simd_add(simd_mul(tens, mixed), tens));
+        assert_eq!(simd_fma(mixed, mixed, tens), simd_add(simd_mul(mixed, mixed), tens));
+        assert_eq!(simd_fma(tens, mixed, mixed), simd_add(simd_mul(tens, mixed), mixed));
+        assert_eq!(
+            simd_fma(f128x4::splat(-3.2), mixed, f128x4::splat(f128::NEG_INFINITY)),
+            f128x4::splat(f128::NEG_INFINITY)
+        );
+
+        // The same expectations are checked for `simd_relaxed_fma`.
+        assert_eq!(simd_relaxed_fma(tens, mixed, tens), simd_add(simd_mul(tens, mixed), tens));
+        assert_eq!(simd_relaxed_fma(mixed, mixed, tens), simd_add(simd_mul(mixed, mixed), tens));
+        assert_eq!(simd_relaxed_fma(tens, mixed, mixed), simd_add(simd_mul(tens, mixed), mixed));
+        assert_eq!(
+            simd_relaxed_fma(f128x4::splat(-3.2), mixed, f128x4::splat(f128::NEG_INFINITY)),
+            f128x4::splat(f128::NEG_INFINITY)
+        );
+
+        assert_eq!(simd_fsqrt(simd_mul(tens, tens)), tens);
+        assert_eq!(simd_fsqrt(simd_mul(mixed, mixed)), simd_fabs(mixed));
+
+        // Comparisons of `tens` against `5 * mixed`; expected masks hold `!0` in matching lanes.
+        let times_five = simd_mul(f128x4::splat(5.0), mixed);
+        assert_eq!(simd_eq(tens, times_five), i32x4::from_array([0, !0, 0, 0]));
+        assert_eq!(simd_ne(tens, times_five), i32x4::from_array([!0, 0, !0, !0]));
+        assert_eq!(simd_le(tens, times_five), i32x4::from_array([0, !0, !0, 0]));
+        assert_eq!(simd_lt(tens, times_five), i32x4::from_array([0, 0, !0, 0]));
+        assert_eq!(simd_ge(tens, times_five), i32x4::from_array([!0, !0, 0, !0]));
+        assert_eq!(simd_gt(tens, times_five), i32x4::from_array([!0, 0, 0, !0]));
+
+        // Reductions of each vector down to a single scalar.
+        assert_eq!(simd_reduce_add_ordered(tens, 0.0), 40.0f128);
+        assert_eq!(simd_reduce_add_ordered(mixed, 0.0), 2.0f128);
+        assert_eq!(simd_reduce_mul_ordered(tens, 1.0), 10000.0f128);
+        assert_eq!(simd_reduce_mul_ordered(mixed, 1.0), -24.0f128);
+        assert_eq!(simd_reduce_max(tens), 10.0f128);
+        assert_eq!(simd_reduce_max(mixed), 3.0f128);
+        assert_eq!(simd_reduce_min(tens), 10.0f128);
+        assert_eq!(simd_reduce_min(mixed), -4.0f128);
+
+        // When one operand or lane is NaN, max and min are expected to yield the non-NaN value.
+        let nan_last = f128x2::from_array([0.0, f128::NAN]);
+        let nan_first = f128x2::from_array([f128::NAN, 0.0]);
+        assert_eq!(simd_fmax(nan_last, nan_first), f128x2::from_array([0.0, 0.0]));
+        assert_eq!(simd_reduce_max(nan_last), 0.0f128);
+        assert_eq!(simd_reduce_max(nan_first), 0.0f128);
+        assert_eq!(simd_fmin(nan_last, nan_first), f128x2::from_array([0.0, 0.0]));
+        assert_eq!(simd_reduce_min(nan_last), 0.0f128);
+        assert_eq!(simd_reduce_min(nan_first), 0.0f128);
+    }
+}
+
 fn simd_ops_i32() {
     let a = i32x4::splat(10);
     let b = i32x4::from_array([1, 2, 3, -4]);
@@ -563,6 +769,31 @@ fn simd_gather_scatter() {
 }
 
 fn simd_round() {
+    unsafe {
+        use intrinsics::*;
+
+        assert_eq!(
+            simd_ceil(f16x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f16x4::from_array([1.0, 2.0, 2.0, -4.0])
+        );
+        assert_eq!(
+            simd_floor(f16x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f16x4::from_array([0.0, 1.0, 2.0, -5.0])
+        );
+        assert_eq!(
+            simd_round(f16x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f16x4::from_array([1.0, 1.0, 2.0, -5.0])
+        );
+        assert_eq!(
+            simd_round_ties_even(f16x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f16x4::from_array([1.0, 1.0, 2.0, -4.0])
+        );
+        assert_eq!(
+            simd_trunc(f16x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f16x4::from_array([0.0, 1.0, 2.0, -4.0])
+        );
+    }
+
     assert_eq!(
         f32x4::from_array([0.9, 1.001, 2.0, -4.5]).ceil(),
         f32x4::from_array([1.0, 2.0, 2.0, -4.0])
@@ -604,6 +835,31 @@ fn simd_round() {
         f64x4::from_array([0.9, 1.001, 2.0, -4.5]).trunc(),
         f64x4::from_array([0.0, 1.0, 2.0, -4.0])
     );
+
+    unsafe {
+        use intrinsics::*;
+
+        assert_eq!(
+            simd_ceil(f128x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f128x4::from_array([1.0, 2.0, 2.0, -4.0])
+        );
+        assert_eq!(
+            simd_floor(f128x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f128x4::from_array([0.0, 1.0, 2.0, -5.0])
+        );
+        assert_eq!(
+            simd_round(f128x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f128x4::from_array([1.0, 1.0, 2.0, -5.0])
+        );
+        assert_eq!(
+            simd_round_ties_even(f128x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f128x4::from_array([1.0, 1.0, 2.0, -4.0])
+        );
+        assert_eq!(
+            simd_trunc(f128x4::from_array([0.9, 1.001, 2.0, -4.5])),
+            f128x4::from_array([0.0, 1.0, 2.0, -4.0])
+        );
+    }
 }
 
 fn simd_intrinsics() {