Skip to content

Commit

Permalink
Implement all vendor intrinsics used by the fimg crate
Browse files Browse the repository at this point in the history
  • Loading branch information
bjorn3 committed Nov 7, 2023
1 parent 0a35232 commit ecf79a3
Showing 1 changed file with 154 additions and 0 deletions.
154 changes: 154 additions & 0 deletions src/intrinsics/llvm_x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,160 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
}
}

"llvm.x86.avx2.packuswb" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
intrinsic_args!(fx, args => (a, b); intrinsic);

assert_eq!(a.layout(), b.layout());
let layout = a.layout();

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
assert_eq!(lane_ty, fx.tcx.types.i16);
assert_eq!(ret_lane_ty, fx.tcx.types.u8);
assert_eq!(lane_count * 2, ret_lane_count);

let zero = fx.bcx.ins().iconst(types::I16, 0);
let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);

for idx in 0..lane_count / 2 {
let lane = a.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, zero);
let sat = fx.bcx.ins().umin(sat, max_u8);
let res = fx.bcx.ins().ireduce(types::I8, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count / 2 {
let lane = b.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, zero);
let sat = fx.bcx.ins().umin(sat, max_u8);
let res = fx.bcx.ins().ireduce(types::I8, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count / 2 {
let lane = a.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, zero);
let sat = fx.bcx.ins().umin(sat, max_u8);
let res = fx.bcx.ins().ireduce(types::I8, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count / 2 {
let lane = b.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, zero);
let sat = fx.bcx.ins().umin(sat, max_u8);
let res = fx.bcx.ins().ireduce(types::I8, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
}
}

"llvm.x86.sse2.packssdw.128" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
intrinsic_args!(fx, args => (a, b); intrinsic);

assert_eq!(a.layout(), b.layout());
let layout = a.layout();

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
assert_eq!(lane_ty, fx.tcx.types.i32);
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
assert_eq!(lane_count * 2, ret_lane_count);

let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);

for idx in 0..lane_count {
let lane = a.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, min_i16);
let sat = fx.bcx.ins().umin(sat, max_i16);
let res = fx.bcx.ins().ireduce(types::I16, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count {
let lane = b.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, min_i16);
let sat = fx.bcx.ins().umin(sat, max_i16);
let res = fx.bcx.ins().ireduce(types::I16, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
}
}

"llvm.x86.avx2.packssdw" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
intrinsic_args!(fx, args => (a, b); intrinsic);

assert_eq!(a.layout(), b.layout());
let layout = a.layout();

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
assert_eq!(lane_ty, fx.tcx.types.i32);
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
assert_eq!(lane_count * 2, ret_lane_count);

let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);

for idx in 0..lane_count / 2 {
let lane = a.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, min_i16);
let sat = fx.bcx.ins().umin(sat, max_i16);
let res = fx.bcx.ins().ireduce(types::I16, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count / 2 {
let lane = b.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, min_i16);
let sat = fx.bcx.ins().umin(sat, max_i16);
let res = fx.bcx.ins().ireduce(types::I16, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count / 2 {
let lane = a.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, min_i16);
let sat = fx.bcx.ins().umin(sat, max_i16);
let res = fx.bcx.ins().ireduce(types::I16, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
}

for idx in 0..lane_count / 2 {
let lane = b.value_lane(fx, idx).load_scalar(fx);
let sat = fx.bcx.ins().smax(lane, min_i16);
let sat = fx.bcx.ins().umin(sat, max_i16);
let res = fx.bcx.ins().ireduce(types::I16, sat);

let res_lane = CValue::by_val(res, ret_lane_layout);
ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
}
}

_ => {
fx.tcx
.sess
Expand Down

0 comments on commit ecf79a3

Please sign in to comment.