From 4e679ea89cc68829c4b9efc73121467bee56519c Mon Sep 17 00:00:00 2001 From: Sergei Lebedev <185856+superbobry@users.noreply.github.com> Date: Wed, 23 Apr 2025 02:53:35 +0100 Subject: [PATCH 001/245] [MLIR] [python] Fixed the signature of `_OperationBase.get_asm` (#136676) It claimed to return an `io.StringIO` or an `io.BytesIO`, but it did in fact return `str` or `bytes`. --- mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 31 +++++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 1c8080c5d6d2e..6c5f91d757cdc 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -45,9 +45,8 @@ from __future__ import annotations import abc import collections from collections.abc import Callable, Sequence -import io from pathlib import Path -from typing import Any, BinaryIO, ClassVar, TypeVar, overload +from typing import Any, BinaryIO, ClassVar, Literal, TypeVar, overload __all__ = [ "AffineAddExpr", @@ -196,6 +195,19 @@ class _OperationBase: Detaches the operation from its parent block. """ def erase(self) -> None: ... + + @overload + def get_asm( + binary: Literal[True], + large_elements_limit: int | None = None, + enable_debug_info: bool = False, + pretty_debug_info: bool = False, + print_generic_op_form: bool = False, + use_local_scope: bool = False, + assume_verified: bool = False, + skip_regions: bool = False, + ) -> bytes: ... + @overload def get_asm( self, binary: bool = False, @@ -206,19 +218,14 @@ class _OperationBase: use_local_scope: bool = False, assume_verified: bool = False, skip_regions: bool = False, - ) -> io.BytesIO | io.StringIO: + ) -> str: """ - Gets the assembly form of the operation with all options available. + Returns the assembly form of the operation. - Args: - binary: Whether to return a bytes (True) or str (False) object. Defaults to - False. - ... others ...: See the print() method for common keyword arguments for - configuring the printout. - Returns: - Either a bytes or str object, depending on the setting of the 'binary' - argument. + See the print() method for common keyword arguments for configuring + the output. """ + def move_after(self, other: _OperationBase) -> None: """ Puts self immediately after the other operation in its parent block. From 2484060ad970b692443a6a1e7d3bef2797aa751b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 22 Apr 2025 19:24:29 -0700 Subject: [PATCH 002/245] [RISCV] Clear kill flags after replaceRegWith in RISCVFoldMemOffset. (#136762) Any kill flags that were present for the old register are not valid for the replacement and the replacement may have extended the live range of the replacement register. 
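For illustration, a simplified excerpt of the MIR test added below:

  %6:gpr = ADDI %5, 1792
  %7:gpr = ADD %4, killed %6
  %8:gpr = ADD %4, %5
  %10:gpr = LW killed %7, 4

Once the ADDI's 1792 is folded into the LW offset, %6 is replaced with %5, so the ADD now reads "killed %5". Without clearing kill flags, %5 would appear to die there even though %8 still reads it afterwards.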
--- llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp | 1 + llvm/test/CodeGen/RISCV/fold-mem-offset.mir | 43 ++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/fold-mem-offset.mir diff --git a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp index 989e9d859d64f..aa8da1486faca 100644 --- a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp @@ -274,6 +274,7 @@ bool RISCVFoldMemOffset::runOnMachineFunction(MachineFunction &MF) { MemMI->getOperand(2).setImm(NewOffset); MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MRI.clearKillFlags(MI.getOperand(1).getReg()); MI.eraseFromParent(); } } diff --git a/llvm/test/CodeGen/RISCV/fold-mem-offset.mir b/llvm/test/CodeGen/RISCV/fold-mem-offset.mir new file mode 100644 index 0000000000000..41afa26e70641 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fold-mem-offset.mir @@ -0,0 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=riscv32 -run-pass=riscv-fold-mem-offset -verify-machineinstrs -o - | FileCheck %s + +--- +name: crash +tracksRegLiveness: true +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +body: | + bb.0: + liveins: $x10, $x11 + + ; CHECK-LABEL: name: crash + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:gpr = ADD killed [[SLLI]], [[COPY1]] + ; CHECK-NEXT: [[LUI:%[0-9]+]]:gpr = LUI 23 + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:gpr = ADD [[ADD]], [[LUI]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:gpr = ADD [[ADD]], [[LUI]] + ; CHECK-NEXT: [[LW:%[0-9]+]]:gpr = LW killed [[ADD2]], 1792 + ; CHECK-NEXT: [[LW1:%[0-9]+]]:gpr = LW killed [[ADD1]], 1796 + ; CHECK-NEXT: $x10 = COPY [[LW]] + ; CHECK-NEXT: $x11 = COPY [[LW1]] + ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 + %1:gpr = COPY $x11 + %0:gpr = COPY $x10 + %3:gpr = SLLI %1, 3 + %4:gpr = ADD killed %3, %0 + %5:gpr = LUI 23 + %6:gpr = ADDI %5, 1792 + %7:gpr = ADD %4, killed %6 + %8:gpr = ADD %4, %5 + %9:gpr = LW killed %8, 1792 + %10:gpr = LW killed %7, 4 + $x10 = COPY %9 + $x11 = COPY %10 + PseudoRET implicit $x10, implicit $x11 +... 
From 122e5151ba84560de824d46f7b636502d41f2aa0 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 22 Apr 2025 19:52:03 -0700 Subject: [PATCH 003/245] gn build: Port d1cce66469d0 more --- llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 43b8979776902..6a431601ca867 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -32,6 +32,7 @@ static_library("Sema") { "//clang/lib/Edit", "//clang/lib/Lex", "//clang/lib/Support", + "//llvm/lib/Frontend/OpenACC", "//llvm/lib/Frontend/OpenMP", "//llvm/lib/MC", "//llvm/lib/Support", From 4f71655b64a815143d2aedb22b8f423f7ce99e29 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 22 Apr 2025 21:07:31 -0700 Subject: [PATCH 004/245] [clang-format] Fix a bug in parsing C-style cast of lambdas (#136099) Fix #135959 --- clang/lib/Format/UnwrappedLineParser.cpp | 24 ++++++++++++++----- clang/unittests/Format/TokenAnnotatorTest.cpp | 10 ++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index b9430d4389feb..df58e5ef4d6a3 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2368,13 +2368,25 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() { const FormatToken *Previous = FormatTok->Previous; const FormatToken *LeftSquare = FormatTok; nextToken(); - if ((Previous && ((Previous->Tok.getIdentifierInfo() && - !Previous->isOneOf(tok::kw_return, tok::kw_co_await, - tok::kw_co_yield, tok::kw_co_return)) || - Previous->closesScope())) || - LeftSquare->isCppStructuredBinding(IsCpp)) { - return false; + if (Previous) { + if (Previous->Tok.getIdentifierInfo() && + !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, + tok::kw_co_return)) { + return false; + } + if (Previous->closesScope()) { + // Not a potential C-style cast. + if (Previous->isNot(tok::r_paren)) + return false; + const auto *BeforeRParen = Previous->getPreviousNonComment(); + // Lambdas can be cast to function types only, e.g. `std::function` + // and `int (*)()`. 
+ if (!BeforeRParen || !BeforeRParen->isOneOf(tok::greater, tok::r_paren)) + return false; + } } + if (LeftSquare->isCppStructuredBinding(IsCpp)) + return false; if (FormatTok->is(tok::l_square) || tok::isLiteral(FormatTok->Tok.getKind())) return false; if (FormatTok->is(tok::r_square)) { diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index a6410f3c8fe53..73858e87c832a 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2165,6 +2165,16 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) { // FIXME: // EXPECT_TOKEN(Tokens[13], tok::l_paren, TT_LambdaDefinitionLParen); EXPECT_TOKEN(Tokens[17], tok::l_brace, TT_LambdaLBrace); + + Tokens = annotate("auto foo{(std::function)[] { return 0; }};"); + ASSERT_EQ(Tokens.size(), 23u) << Tokens; + EXPECT_TOKEN(Tokens[13], tok::l_square, TT_LambdaLSquare); + EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace); + + Tokens = annotate("auto foo{(int (*)())[] { return 0; }};"); + ASSERT_EQ(Tokens.size(), 21u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::l_square, TT_LambdaLSquare); + EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_LambdaLBrace); } TEST_F(TokenAnnotatorTest, UnderstandsFunctionAnnotations) { From 9efabbbbe58bd8bc2141ba1c914f79376e09cbcf Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 22 Apr 2025 21:08:09 -0700 Subject: [PATCH 005/245] [clang-format] Fix a bug in lexing C++ UDL ending in $ (#136476) Fix #61612 --- clang/lib/Format/FormatTokenLexer.cpp | 29 +++++++++++++++++++ clang/lib/Format/FormatTokenLexer.h | 1 + clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++ 3 files changed, 36 insertions(+) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 5c4e1f814d9b7..a4c94ac411fe0 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -128,6 +128,12 @@ void FormatTokenLexer::tryMergePreviousTokens() { if (Style.isCpp() && tryTransformTryUsageForC()) return; + if ((Style.Language == FormatStyle::LK_Cpp || + Style.Language == FormatStyle::LK_ObjC) && + tryMergeUserDefinedLiteral()) { + return; + } + if (Style.isJavaScript() || Style.isCSharp()) { static const tok::TokenKind NullishCoalescingOperator[] = {tok::question, tok::question}; @@ -559,6 +565,29 @@ bool FormatTokenLexer::tryMergeGreaterGreater() { return true; } +bool FormatTokenLexer::tryMergeUserDefinedLiteral() { + if (Tokens.size() < 2) + return false; + + auto *First = Tokens.end() - 2; + auto &Suffix = First[1]; + if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$") + return false; + + auto &Literal = First[0]; + if (!Literal->Tok.isLiteral()) + return false; + + auto &Text = Literal->TokenText; + if (!Text.ends_with("_")) + return false; + + Text = StringRef(Text.data(), Text.size() + 1); + ++Literal->ColumnWidth; + Tokens.erase(&Suffix); + return true; +} + bool FormatTokenLexer::tryMergeTokens(ArrayRef Kinds, TokenType NewType) { if (Tokens.size() < Kinds.size()) diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h index 61474a3f9ada8..3f001bc69415d 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -48,6 +48,7 @@ class FormatTokenLexer { bool tryMergeLessLess(); bool tryMergeGreaterGreater(); + bool tryMergeUserDefinedLiteral(); bool tryMergeNSStringLiteral(); bool tryMergeJSPrivateIdentifier(); bool tryMergeCSharpStringLiteral(); diff --git 
a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 73858e87c832a..e540af85aff3a 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3985,6 +3985,12 @@ TEST_F(TokenAnnotatorTest, IdentifierPackage) { EXPECT_FALSE(Tokens[0]->isObjCAccessSpecifier()); } +TEST_F(TokenAnnotatorTest, UserDefinedLiteral) { + auto Tokens = annotate("auto dollars = 2_$;"); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_EQ(Tokens[3]->TokenText, "2_$"); +} + } // namespace } // namespace format } // namespace clang From 037657de7e5ccd4a37054829874a209b82fb8be7 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 22 Apr 2025 21:08:56 -0700 Subject: [PATCH 006/245] [clang-format] Correctly annotate kw_operator in using decls (#136545) Fix #136541 --- clang/lib/Format/TokenAnnotator.cpp | 6 ++++-- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index b4f303e281c1d..6d861d19117e2 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3977,8 +3977,10 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { FormatToken *AfterLastAttribute = nullptr; FormatToken *ClosingParen = nullptr; - for (auto *Tok = FirstNonComment ? FirstNonComment->Next : nullptr; Tok; - Tok = Tok->Next) { + for (auto *Tok = FirstNonComment && FirstNonComment->isNot(tok::kw_using) + ? FirstNonComment->Next + : nullptr; + Tok; Tok = Tok->Next) { if (Tok->is(TT_StartOfName)) SeenName = true; if (Tok->Previous->EndsCppAttributeGroup) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index e540af85aff3a..87b2f329d57cf 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1084,6 +1084,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) { ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::identifier, TT_FunctionDeclarationName); EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_OverloadedOperatorLParen); + + Tokens = annotate("using std::operator==;"); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + // Not TT_FunctionDeclarationName. 
+ EXPECT_TOKEN(Tokens[3], tok::kw_operator, TT_Unknown); } TEST_F(TokenAnnotatorTest, OverloadedOperatorInTemplate) { From afc030dd30e377ca7bf225a97179fa1b64eedd28 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 22 Apr 2025 21:09:21 -0700 Subject: [PATCH 007/245] [clang-format] Don't test stability if JS format test fails (#136662) --- clang/unittests/Format/FormatTestJS.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index 1cfacc060d944..91577b9a49167 100644 --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -48,18 +48,22 @@ class FormatTestJS : public testing::Test { static void verifyFormat( StringRef Code, const FormatStyle &Style = getGoogleStyle(FormatStyle::LK_JavaScript)) { - EXPECT_EQ(Code.str(), format(Code, Style)) << "Expected code is not stable"; - std::string Result = format(test::messUp(Code), Style); - EXPECT_EQ(Code.str(), Result) << "Formatted:\n" << Result; + auto Result = format(test::messUp(Code), Style); + EXPECT_EQ(Code, Result) << "Formatted:\n" << Result; + if (Code != Result) + return; + EXPECT_EQ(Code, format(Code, Style)) << "Expected code is not stable"; } static void verifyFormat( StringRef Expected, StringRef Code, const FormatStyle &Style = getGoogleStyle(FormatStyle::LK_JavaScript)) { - EXPECT_EQ(Expected.str(), format(Expected, Style)) + auto Result = format(Code, Style); + EXPECT_EQ(Expected, Result) << "Formatted:\n" << Result; + if (Expected != Result) + return; + EXPECT_EQ(Expected, format(Expected, Style)) << "Expected code is not stable"; - std::string Result = format(Code, Style); - EXPECT_EQ(Expected.str(), Result) << "Formatted:\n" << Result; } }; From 68d89e931619ce5c9bc6fffcbe2d5b5268047f3c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 22 Apr 2025 21:25:41 -0700 Subject: [PATCH 008/245] [RISCV] Remove stale comment. NFC --- llvm/lib/TargetParser/RISCVISAInfo.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 1e7144ce6d22b..ff0174210f87f 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -643,7 +643,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, for (const char *Ext : RISCVGImplications) { auto Version = findDefaultVersion(Ext); assert(Version && "Default extension version not found?"); - // Postpone AddExtension until end of this function ISAInfo->Exts[std::string(Ext)] = {Version->Major, Version->Minor}; } break; From 34a4c58018730736b940c4db4d694feed3266f52 Mon Sep 17 00:00:00 2001 From: Michele Scandale Date: Tue, 22 Apr 2025 21:29:07 -0700 Subject: [PATCH 009/245] [clang] Rework `hasBooleanRepresentation`. (#136038) This is a follow-up of 13aac46332f607a38067b5ddd466071683b8c255. This commit adjusts the implementation of `hasBooleanRepresentation` to be somewhat aligned to `hasIntegerRepresentation`. In particular vector of booleans should be handled in `hasBooleanRepresentation`, while `_Atomic(bool)` should not. 
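Roughly, the intended results after this change (a sketch derived from the new implementation):

  bool                              -> true
  enum with a bool underlying type  -> true (only if the enum is complete)
  vector of bool                    -> true
  _BitInt(1)                        -> true
  _Atomic(bool)                     -> false; callers such as EmitToMemory/EmitFromMemory now unwrap the atomic value type themselves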
--- clang/include/clang/AST/Type.h | 5 +++-- clang/lib/AST/Type.cpp | 19 +++++++++---------- clang/lib/CodeGen/CGExpr.cpp | 20 +++++++++++++------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 20ff529c7e0c6..86d43e1a05b55 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2793,8 +2793,9 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { /// of some sort, e.g., it is a floating-point type or a vector thereof. bool hasFloatingRepresentation() const; - /// Determine whether this type has a boolean representation - /// of some sort. + /// Determine whether this type has a boolean representation -- i.e., it is a + /// boolean type, an enum type whose underlying type is a boolean type, or a + /// vector of booleans. bool hasBooleanRepresentation() const; // Type Checking Functions: Check to see if this type is structurally the diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 67cd690af7499..08be90eab6537 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2346,16 +2346,15 @@ bool Type::isArithmeticType() const { } bool Type::hasBooleanRepresentation() const { - if (isBooleanType()) - return true; - - if (const EnumType *ET = getAs()) - return ET->getDecl()->getIntegerType()->isBooleanType(); - - if (const AtomicType *AT = getAs()) - return AT->getValueType()->hasBooleanRepresentation(); - - return false; + if (const auto *VT = dyn_cast(CanonicalType)) + return VT->getElementType()->isBooleanType(); + if (const auto *ET = dyn_cast(CanonicalType)) { + return ET->getDecl()->isComplete() && + ET->getDecl()->getIntegerType()->isBooleanType(); + } + if (const auto *IT = dyn_cast(CanonicalType)) + return IT->getNumBits() == 1; + return isBooleanType(); } Type::ScalarTypeKind Type::getScalarTypeKind() const { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index abb88477062fc..786a56eed7ed5 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1920,7 +1920,7 @@ static bool getRangeForType(CodeGenFunction &CGF, QualType Ty, llvm::MDNode *CodeGenFunction::getRangeForLoadFromType(QualType Ty) { llvm::APInt Min, End; if (!getRangeForType(*this, Ty, Min, End, CGM.getCodeGenOpts().StrictEnums, - Ty->hasBooleanRepresentation())) + Ty->hasBooleanRepresentation() && !Ty->isVectorType())) return nullptr; llvm::MDBuilder MDHelper(getLLVMContext()); @@ -1948,7 +1948,7 @@ bool CodeGenFunction::EmitScalarRangeCheck(llvm::Value *Value, QualType Ty, if (!HasBoolCheck && !HasEnumCheck) return false; - bool IsBool = Ty->hasBooleanRepresentation() || + bool IsBool = (Ty->hasBooleanRepresentation() && !Ty->isVectorType()) || NSAPI(CGM.getContext()).isObjCBOOLType(Ty); bool NeedsBoolCheck = HasBoolCheck && IsBool; bool NeedsEnumCheck = HasEnumCheck && Ty->getAs(); @@ -2068,11 +2068,8 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, /// by ConvertType) to its load/store type (as returned by /// convertTypeForLoadStore). 
llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) { - if (Ty->hasBooleanRepresentation() || Ty->isBitIntType()) { - llvm::Type *StoreTy = convertTypeForLoadStore(Ty, Value->getType()); - bool Signed = Ty->isSignedIntegerOrEnumerationType(); - return Builder.CreateIntCast(Value, StoreTy, Signed, "storedv"); - } + if (auto *AtomicTy = Ty->getAs()) + Ty = AtomicTy->getValueType(); if (Ty->isExtVectorBoolType()) { llvm::Type *StoreTy = convertTypeForLoadStore(Ty, Value->getType()); @@ -2088,6 +2085,12 @@ llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) { Value = Builder.CreateBitCast(Value, StoreTy); } + if (Ty->hasBooleanRepresentation() || Ty->isBitIntType()) { + llvm::Type *StoreTy = convertTypeForLoadStore(Ty, Value->getType()); + bool Signed = Ty->isSignedIntegerOrEnumerationType(); + return Builder.CreateIntCast(Value, StoreTy, Signed, "storedv"); + } + return Value; } @@ -2095,6 +2098,9 @@ llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) { /// by convertTypeForLoadStore) to its primary IR type (as returned /// by ConvertType). llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) { + if (auto *AtomicTy = Ty->getAs()) + Ty = AtomicTy->getValueType(); + if (Ty->isPackedVectorBoolType(getContext())) { const auto *RawIntTy = Value->getType(); From 141c14c9522ba2bf7472d660d64928b9982b5f6e Mon Sep 17 00:00:00 2001 From: tangaac Date: Wed, 23 Apr 2025 12:48:29 +0800 Subject: [PATCH 010/245] [LoongArch] Pre-commit for widen shuffle mask (#136544) --- .../LoongArch/lasx/widen-shuffle-mask.ll | 137 ++++++++++++++++++ .../LoongArch/lsx/widen-shuffle-mask.ll | 137 ++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/widen-shuffle-mask.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/widen-shuffle-mask.ll b/llvm/test/CodeGen/LoongArch/lasx/widen-shuffle-mask.ll new file mode 100644 index 0000000000000..c32a60622f2a1 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/widen-shuffle-mask.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define <32 x i8> @widen_shuffle_mask_v32i8_to_v16i16(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_v8i32(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI1_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_v4i64(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI2_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <16 x 
i16> @widen_shuffle_mask_v16i16_to_v8i32(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i16_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0) +; CHECK-NEXT: xvshuf.h $xr2, $xr1, $xr0 +; CHECK-NEXT: xvori.b $xr0, $xr2, 0 +; CHECK-NEXT: ret + %r = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %r +} + +define <16 x i16> @widen_shuffle_mask_v16i16_to_v4i64(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i16_to_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI4_0) +; CHECK-NEXT: xvshuf.h $xr2, $xr1, $xr0 +; CHECK-NEXT: xvori.b $xr0, $xr2, 0 +; CHECK-NEXT: ret + %r = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %r +} + +define <8 x i32> @widen_shuffle_mask_v8i32_to_v4i64(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: widen_shuffle_mask_v8i32_to_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI5_0) +; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 +; CHECK-NEXT: xvori.b $xr0, $xr2, 0 +; CHECK-NEXT: ret + %r = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + ret <8 x i32> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_xvpackev_h(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_xvpackev_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI6_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_xvpackod_h(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_xvpackod_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI7_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_xvpickev_h(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_xvpickev_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI8_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_xvpickod_h(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_xvpickod_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI9_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_xvilvl_h(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_xvilvl_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI10_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} + +define <32 x i8> @widen_shuffle_mask_v32i8_to_xvilvh_h(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v32i8_to_xvilvh_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI11_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI11_0) +; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: ret + %r = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %r +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll b/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll new file mode 100644 index 0000000000000..35457ffa59586 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define <16 x i8> @widen_shuffle_mask_v16i8_to_v8i16(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_v4i32(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI1_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_v2i64(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI2_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <8 x i16> @widen_shuffle_mask_v8i16_to_v4i32(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: widen_shuffle_mask_v8i16_to_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI3_0) +; CHECK-NEXT: vshuf.h $vr2, $vr1, $vr0 +; CHECK-NEXT: vori.b $vr0, $vr2, 0 +; CHECK-NEXT: ret + %r = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> + ret <8 x i16> %r +} + +define <8 x i16> @widen_shuffle_mask_v8i16_to_v2i64(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: widen_shuffle_mask_v8i16_to_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI4_0) +; CHECK-NEXT: vshuf.h $vr2, $vr1, $vr0 +; CHECK-NEXT: vori.b $vr0, $vr2, 0 +; CHECK-NEXT: ret + %r = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> + ret <8 x i16> %r +} + +define <4 x i32> @widen_shuffle_mask_v4i32_to_v2i64(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: widen_shuffle_mask_v4i32_to_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI5_0) +; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0 +; CHECK-NEXT: vori.b $vr0, $vr2, 0 +; CHECK-NEXT: ret + %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_vpackev_h(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_vpackev_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI6_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + 
+define <16 x i8> @widen_shuffle_mask_v16i8_to_vpackod_h(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_vpackod_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI7_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_vpickev_h(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_vpickev_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI8_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_vpickod_h(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_vpickod_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI9_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_vilvl_h(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_vilvl_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI10_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @widen_shuffle_mask_v16i8_to_vilvh_h(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: widen_shuffle_mask_v16i8_to_vilvh_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) +; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI11_0) +; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %r +} From 7547ad3a7bc1e249c240512438eb39581f58c8ef Mon Sep 17 00:00:00 2001 From: lntue Date: Wed, 23 Apr 2025 12:04:21 +0700 Subject: [PATCH 011/245] [libc][math] Skip checking for exceptional values in expm1f when LIBC_MATH_SKIP_ACCURATE_PASS is set. 
(#130968) --- libc/src/math/generic/expm1f.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index 1e44e943d9258..b2967e2516197 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -30,6 +30,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { uint32_t x_u = xbits.uintval(); uint32_t x_abs = x_u & 0x7fff'ffffU; +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Exceptional value if (LIBC_UNLIKELY(x_u == 0x3e35'bec5U)) { // x = 0x1.6b7d8ap-3f int round_mode = fputil::quick_get_round(); @@ -37,7 +38,6 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return 0x1.8dbe64p-3f; return 0x1.8dbe62p-3f; } - #if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE) if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f int round_mode = fputil::quick_get_round(); @@ -46,6 +46,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return -0x1.71c882p-4f; } #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS // When |x| > 25*log(2), or nan if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) { From 439f16a7e12f1aece321266e4fce760841bfcdf1 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 23 Apr 2025 07:11:25 +0200 Subject: [PATCH 012/245] [mlir][bazel] Port e112dccc8ba49425c575a6b15325f2cbeef5c606. --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f79c3656dda92..8a85c6fffd628 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6271,6 +6271,7 @@ td_library( ":BuiltinDialectTdFiles", ":DataLayoutInterfacesTdFiles", ":OpBaseTdFiles", + ":ViewLikeInterfaceTdFiles", ], ) @@ -6364,6 +6365,7 @@ cc_library( ":PtrMemorySpaceInterfacesIncGen", ":PtrOpsEnumsGen", ":PtrOpsIncGen", + ":ViewLikeInterface", "//llvm:Support", ], ) From 3ccfbc8a002e1e0f64b5408d26bc42282afc194b Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 22 Apr 2025 22:38:28 -0700 Subject: [PATCH 013/245] [lldb] Make sure changing the separator takes immediate effect (#136779) The setter is only used when changing the setting programmatically. When using the settings command, we need to monitor SetPropertyValue. --- lldb/source/Core/Debugger.cpp | 4 +++- lldb/test/API/functionalities/statusline/TestStatusline.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index b572210f25603..cd8726eeba632 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -257,7 +257,9 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx, else m_statusline.reset(); } else if (property_path == - g_debugger_properties[ePropertyStatuslineFormat].name) { + g_debugger_properties[ePropertyStatuslineFormat].name || + property_path == + g_debugger_properties[ePropertySeparator].name) { // Statusline format changed. Redraw the statusline. 
RedrawStatusline(); } else if (property_path == diff --git a/lldb/test/API/functionalities/statusline/TestStatusline.py b/lldb/test/API/functionalities/statusline/TestStatusline.py index dcededdb11e39..da6b4e7c8f320 100644 --- a/lldb/test/API/functionalities/statusline/TestStatusline.py +++ b/lldb/test/API/functionalities/statusline/TestStatusline.py @@ -46,8 +46,10 @@ def test(self): self.child.expect(re.escape("a.out | main.c:2:11 | bre")) self.child.setwinsize(terminal_height, terminal_width) + # Change the separator. + self.expect('set set separator "S "', ["a.out S main.c:2:11"]) + # Change the format. - self.expect('set set separator "S"') self.expect( 'set set statusline-format "target = {${target.file.basename}} ${separator}"', ["target = a.out S"], From 7b6801574d978ef418dd76257478cbbe5866b09f Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Wed, 23 Apr 2025 07:53:07 +0200 Subject: [PATCH 014/245] [CIR] Infer MLIRContext in attr builders when possible (#136741) Mirrors incubator changes from https://github.com/llvm/clangir/pull/1582 --- .../clang/CIR/Dialect/Builder/CIRBaseBuilder.h | 3 +-- clang/include/clang/CIR/Dialect/IR/CIRAttrs.td | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 539268c6270f4..ef29791ed2783 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -166,8 +166,7 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) { auto valueAttr = mlir::IntegerAttr::get( mlir::IntegerType::get(type.getContext(), 64), value); - return cir::ConstPtrAttr::get( - getContext(), mlir::cast(type), valueAttr); + return cir::ConstPtrAttr::get(type, valueAttr); } mlir::Value createAlloca(mlir::Location loc, cir::PointerType addrType, diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td index 293b83e70eff7..cce63c5cae608 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td @@ -202,6 +202,11 @@ def ConstArrayAttr : CIR_Attr<"ConstArray", "const_array", [TypedAttrInterface]> zeros = typeSize - mlir::cast(elts).size(); return $_get(type.getContext(), type, elts, zeros); + }]>, + AttrBuilderWithInferredContext<(ins "cir::ArrayType":$type, + "mlir::Attribute":$elts, + "int":$trailingZerosNum), [{ + return $_get(type.getContext(), type, elts, trailingZerosNum); }]> ]; @@ -234,11 +239,7 @@ def ConstPtrAttr : CIR_Attr<"ConstPtr", "ptr", [TypedAttrInterface]> { "mlir::IntegerAttr":$value), [{ return $_get(type.getContext(), mlir::cast(type), value); - }]>, - AttrBuilder<(ins "mlir::Type":$type, - "mlir::IntegerAttr":$value), [{ - return $_get($_ctxt, mlir::cast(type), value); - }]>, + }]> ]; let extraClassDeclaration = [{ bool isNullValue() const { return getValue().getInt() == 0; } From 5080a0251fe3352d26560075a9b3b8c9acb13d23 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 23 Apr 2025 08:54:10 +0300 Subject: [PATCH 015/245] [CodeGenPrepare] Unfold slow ctpop when used in power-of-two test (#102731) DAG combiner already does this transformation, but in some cases it does not have a chance because either CodeGenPrepare or SelectionDAGBuilder move icmp to a different basic block. 
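Concretely, the unfolds performed when ctpop is considered slow are (from the comments in the new code):

  ctpop(x) u< 2  ->  (x & (x - 1)) == 0
  ctpop(x) u> 1  ->  (x & (x - 1)) != 0
  ctpop(x) == 1  ->  (x ^ (x - 1)) u>  (x - 1), or (x & (x - 1)) == 0 if x is known non-zero
  ctpop(x) != 1  ->  (x ^ (x - 1)) u<= (x - 1), or (x & (x - 1)) != 0 if x is known non-zero

If ctpop is fast, only the predicate is adjusted, and only when the operand is known non-zero: == 1 becomes u< 2 and != 1 becomes u> 1.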
https://alive2.llvm.org/ce/z/ARzh99 Fixes #94829 Pull Request: https://github.com/llvm/llvm-project/pull/102731 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 101 ++++-- .../PowerPC/vector-popcnt-128-ult-ugt.ll | 16 +- llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll | 205 +++---------- llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll | 28 +- llvm/test/CodeGen/RISCV/pr101786.ll | 32 ++ llvm/test/CodeGen/RISCV/rv32zbb.ll | 96 +++--- llvm/test/CodeGen/RISCV/rv64zbb.ll | 48 +-- llvm/test/CodeGen/X86/ispow2.ll | 44 +-- llvm/test/CodeGen/X86/pr94829.ll | 32 ++ llvm/test/CodeGen/X86/vector-popcnt-128.ll | 34 +-- .../CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 136 ++++----- llvm/test/CodeGen/X86/vector-popcnt-256.ll | 288 +++++++++--------- .../CodeGen/X86/vector-popcnt-512-ult-ugt.ll | 80 ++--- llvm/test/CodeGen/X86/vector-popcnt-512.ll | 112 +++---- .../CodeGenPrepare/unfold-pow2-test-vec.ll | 85 ++++++ .../CodeGenPrepare/unfold-pow2-test.ll | 123 ++++++++ 16 files changed, 814 insertions(+), 646 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/pr101786.ll create mode 100644 llvm/test/CodeGen/X86/pr94829.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 12a668507fe65..e8dc7752b23c0 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -474,6 +474,7 @@ class CodeGenPrepare { bool optimizeURem(Instruction *Rem); bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT); bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT); + bool unfoldPowerOf2Test(CmpInst *Cmp); void verifyBFIUpdates(Function &F); bool _run(Function &F); }; @@ -1762,6 +1763,75 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, return true; } +// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow. +// The same transformation exists in DAG combiner, but we repeat it here because +// DAG builder can break the pattern by moving icmp into a successor block. +bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) { + CmpPredicate Pred; + Value *X; + const APInt *C; + + // (icmp (ctpop x), c) + if (!match(Cmp, m_ICmp(Pred, m_Intrinsic(m_Value(X)), + m_APIntAllowPoison(C)))) + return false; + + // We're only interested in "is power of 2 [or zero]" patterns. + bool IsStrictlyPowerOf2Test = ICmpInst::isEquality(Pred) && *C == 1; + bool IsPowerOf2OrZeroTest = (Pred == CmpInst::ICMP_ULT && *C == 2) || + (Pred == CmpInst::ICMP_UGT && *C == 1); + if (!IsStrictlyPowerOf2Test && !IsPowerOf2OrZeroTest) + return false; + + // Some targets have better codegen for `ctpop(x) u= 2/1`than for + // `ctpop(x) ==/!= 1`. If ctpop is fast, only try changing the comparison, + // and otherwise expand ctpop into a few simple instructions. + Type *OpTy = X->getType(); + if (TLI->isCtpopFast(TLI->getValueType(*DL, OpTy))) { + // Look for `ctpop(x) ==/!= 1`, where `ctpop(x)` is known to be non-zero. 
+ if (!IsStrictlyPowerOf2Test || !isKnownNonZero(Cmp->getOperand(0), *DL)) + return false; + + // ctpop(x) == 1 -> ctpop(x) u< 2 + // ctpop(x) != 1 -> ctpop(x) u> 1 + if (Pred == ICmpInst::ICMP_EQ) { + Cmp->setOperand(1, ConstantInt::get(OpTy, 2)); + Cmp->setPredicate(ICmpInst::ICMP_ULT); + } else { + Cmp->setPredicate(ICmpInst::ICMP_UGT); + } + return true; + } + + Value *NewCmp; + if (IsPowerOf2OrZeroTest || + (IsStrictlyPowerOf2Test && isKnownNonZero(Cmp->getOperand(0), *DL))) { + // ctpop(x) u< 2 -> (x & (x - 1)) == 0 + // ctpop(x) u> 1 -> (x & (x - 1)) != 0 + IRBuilder<> Builder(Cmp); + Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy)); + Value *And = Builder.CreateAnd(X, Sub); + CmpInst::Predicate NewPred = + (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_EQ) + ? CmpInst::ICMP_EQ + : CmpInst::ICMP_NE; + NewCmp = Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(OpTy)); + } else { + // ctpop(x) == 1 -> (x ^ (x - 1)) u> (x - 1) + // ctpop(x) != 1 -> (x ^ (x - 1)) u<= (x - 1) + IRBuilder<> Builder(Cmp); + Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy)); + Value *Xor = Builder.CreateXor(X, Sub); + CmpInst::Predicate NewPred = + Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE; + NewCmp = Builder.CreateICmp(NewPred, Xor, Sub); + } + + Cmp->replaceAllUsesWith(NewCmp); + RecursivelyDeleteTriviallyDeadInstructions(Cmp); + return true; +} + /// Sink the given CmpInst into user blocks to reduce the number of virtual /// registers that must be created and coalesced. This is a clear win except on /// targets with multiple condition code registers (PowerPC), where it might @@ -2148,31 +2218,6 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { return false; } -/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. -/// This function converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` if the -/// result cannot be zero. 
-static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { - CmpPredicate Pred; - if (!match(Cmp, m_ICmp(Pred, m_Intrinsic(), m_One()))) - return false; - if (!ICmpInst::isEquality(Pred)) - return false; - auto *II = cast(Cmp->getOperand(0)); - - if (isKnownNonZero(II, DL)) { - if (Pred == ICmpInst::ICMP_EQ) { - Cmp->setOperand(1, ConstantInt::get(II->getType(), 2)); - Cmp->setPredicate(ICmpInst::ICMP_ULT); - } else { - Cmp->setPredicate(ICmpInst::ICMP_UGT); - } - return true; - } - return false; -} - bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -2183,6 +2228,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (combineToUSubWithOverflow(Cmp, ModifiedDT)) return true; + if (unfoldPowerOf2Test(Cmp)) + return true; + if (foldICmpWithDominatingICmp(Cmp, *TLI)) return true; @@ -2192,9 +2240,6 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (foldFCmpToFPClassTest(Cmp, *TLI, *DL)) return true; - if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL)) - return true; - return false; } diff --git a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll index ff7f1fc902981..04351346745b3 100644 --- a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll @@ -11945,11 +11945,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_1_v2i64: ; PWR5: # %bb.0: ; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: addi 6, 4, -1 ; PWR5-NEXT: and 3, 3, 5 -; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 6 ; PWR5-NEXT: subfic 3, 3, 0 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: and 4, 4, 5 ; PWR5-NEXT: subfic 4, 4, 0 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr @@ -11957,11 +11957,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) { ; PWR6-LABEL: ugt_1_v2i64: ; PWR6: # %bb.0: ; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: addi 6, 4, -1 ; PWR6-NEXT: and 3, 3, 5 -; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 6 ; PWR6-NEXT: subfic 3, 3, 0 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: and 4, 4, 5 ; PWR6-NEXT: subfic 4, 4, 0 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr @@ -12016,11 +12016,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_2_v2i64: ; PWR5: # %bb.0: ; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: addi 6, 4, -1 ; PWR5-NEXT: and 3, 3, 5 -; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 6 ; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: and 4, 4, 5 ; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr @@ -12028,11 +12028,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) { ; PWR6-LABEL: ult_2_v2i64: ; PWR6: # %bb.0: ; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: addi 6, 4, -1 ; PWR6-NEXT: and 3, 3, 5 -; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 6 ; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: and 4, 4, 5 ; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll index 95af7861d4798..f9af74d6ec323 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll @@ -357,49 +357,14 @@ define i64 @ctpop_i64(i64 %a) nounwind { define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64_ugt_two: ; RV32I: # %bb.0: -; RV32I-NEXT: j .LBB6_2 -; RV32I-NEXT: # %bb.1: 
-; RV32I-NEXT: sltiu a0, zero, 0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB6_2: -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: srli a3, a1, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: slli a3, a1, 16 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: sltiu a0, a0, 2 +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 +; RV32I-NEXT: sltiu a4, a2, -1 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i64_ugt_two: @@ -422,50 +387,14 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { define i1 @ctpop_i64_ugt_one(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64_ugt_one: ; RV32I: # %bb.0: -; RV32I-NEXT: j .LBB7_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: snez a0, zero -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: srli a3, a1, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: slli a3, a1, 16 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: sltiu a0, a0, 2 -; RV32I-NEXT: xori a0, a0, 1 +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 +; RV32I-NEXT: sltiu a4, a2, -1 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i64_ugt_one: @@ -489,45 +418,18 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind { define i1 @ctpop_i64_eq_one(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64_eq_one: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; 
RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: srli a3, a1, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: slli a3, a1, 16 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: xori a0, a0, 1 -; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: sltiu a3, a2, -1 +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: add a3, a4, a3 +; RV32I-NEXT: xor a1, a1, a3 +; RV32I-NEXT: beq a1, a3, .LBB8_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a0, a3, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: xor a0, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i64_eq_one: @@ -546,45 +448,20 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind { define i1 @ctpop_i64_ne_one(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64_ne_one: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: srli a3, a1, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: slli a3, a1, 16 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: sltiu a3, a2, -1 +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: add a3, a4, a3 +; RV32I-NEXT: xor a1, a1, a3 +; RV32I-NEXT: beq a1, a3, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a0, a3, a1 +; RV32I-NEXT: xori a0, a0, 1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: xor a0, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 ; RV32I-NEXT: xori a0, a0, 1 -; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i64_ne_one: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 9a6c718703a27..8549a7c526e45 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -701,31 +701,9 @@ define signext 
i32 @ctpop_i32(i32 signext %a) nounwind { define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind { ; RV64I-LABEL: ctpop_i32_ult_two: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: sraiw a1, a0, 4 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: sltiu a0, a0, 2 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_i32_ult_two: diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll new file mode 100644 index 0000000000000..6d0736edd3e89 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr101786.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=riscv64 -o - %s | FileCheck %s + +define i64 @test(i64 %x, ptr %p) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: bgtz a2, .LBB0_3 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: addi a3, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: bnez a2, .LBB0_3 +; CHECK-NEXT: # %bb.2: # %if.else +; CHECK-NEXT: ld a0, 0(a1) +; CHECK-NEXT: .LBB0_3: # %if.end +; CHECK-NEXT: ret +entry: + %ctpop = call i64 @llvm.ctpop.i64(i64 %x) + %cmp1 = icmp ugt i64 %ctpop, 1 + %cmp2 = icmp sgt i64 %x, 0 + %or = or i1 %cmp2, %cmp1 + br i1 %or, label %if.end, label %if.else + +if.else: + %load = load i64, ptr %p, align 8 + br label %if.end + +if.end: + %res = phi i64 [0, %entry], [%load, %if.else] + ret i64 %res +} diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 1b9b1b89aeb7e..98c86da41afa1 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -416,9 +416,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { ; RV32I-LABEL: ctpop_v2i32_ult_two: ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 +; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: addi a2, a1, -1 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: ret @@ -439,9 +439,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { ; RV32I-LABEL: ctpop_v2i32_ugt_one: ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 +; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: addi a2, a1, -1 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: snez a1, a1 ; RV32I-NEXT: ret @@ -464,11 +464,11 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { ; RV32I-LABEL: ctpop_v2i32_eq_one: ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 +; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: addi a2, a1, -1 -; RV32I-NEXT: xor a1, a1, a2 -; 
RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: sltu a1, a3, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i32_eq_one: @@ -489,11 +489,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { ; RV32I-LABEL: ctpop_v2i32_ne_one: ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 +; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: addi a2, a1, -1 -; RV32I-NEXT: xor a1, a1, a2 -; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: sltu a1, a3, a1 ; RV32I-NEXT: xori a0, a0, 1 ; RV32I-NEXT: xori a1, a1, 1 ; RV32I-NEXT: ret @@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind { define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64_ugt_two: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a2, a0, -1 -; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: and a0, a1, a0 -; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: seqz a2, a0 +; RV32I-NEXT: addi a3, a0, -1 +; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: ret ; @@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { define i1 @ctpop_i64_ugt_one(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64_ugt_one: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a2, a0, -1 -; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: and a0, a1, a0 -; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: seqz a2, a0 +; RV32I-NEXT: addi a3, a0, -1 +; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: ret ; @@ -785,20 +785,20 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64_ult_two: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a1, 0(a0) -; RV32I-NEXT: lw a2, 4(a0) -; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a2, 8(a0) +; RV32I-NEXT: lw a3, 4(a0) ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: addi a4, a1, -1 -; RV32I-NEXT: and a4, a1, a4 -; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: sub a1, a2, a1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: addi a2, a3, -1 -; RV32I-NEXT: and a2, a3, a2 -; RV32I-NEXT: seqz a3, a3 -; RV32I-NEXT: sub a3, a0, a3 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: seqz a4, a1 +; RV32I-NEXT: seqz a5, a2 +; RV32I-NEXT: addi a6, a1, -1 +; RV32I-NEXT: addi a7, a2, -1 +; RV32I-NEXT: sub a4, a3, a4 +; RV32I-NEXT: sub a5, a0, a5 +; RV32I-NEXT: and a2, a2, a7 +; RV32I-NEXT: and a1, a1, a6 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: or a2, a2, a0 ; RV32I-NEXT: seqz a0, a1 ; RV32I-NEXT: seqz a1, a2 @@ -828,20 +828,20 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64_ugt_one: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a1, 0(a0) -; RV32I-NEXT: lw a2, 4(a0) -; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a2, 8(a0) +; RV32I-NEXT: lw a3, 4(a0) ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: addi a4, a1, -1 -; RV32I-NEXT: and a4, a1, a4 -; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: sub a1, a2, a1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: addi a2, a3, -1 -; RV32I-NEXT: and a2, a3, a2 -; RV32I-NEXT: seqz a3, a3 -; RV32I-NEXT: sub a3, a0, a3 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: seqz a4, a1 +; RV32I-NEXT: seqz a5, a2 +; RV32I-NEXT: addi a6, a1, -1 +; RV32I-NEXT: addi a7, a2, -1 +; RV32I-NEXT: sub a4, a3, a4 +; 
RV32I-NEXT: sub a5, a0, a5 +; RV32I-NEXT: and a2, a2, a7 +; RV32I-NEXT: and a1, a1, a6 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: or a2, a2, a0 ; RV32I-NEXT: snez a0, a1 ; RV32I-NEXT: snez a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 25325ad7d50a4..17eb0817d548a 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -719,11 +719,11 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { ; RV64I-LABEL: ctpop_v2i32_ult_two: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: addi a2, a1, -1 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: ret @@ -744,11 +744,11 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { ; RV64I-LABEL: ctpop_v2i32_ugt_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: addi a2, a1, -1 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: snez a1, a1 ; RV64I-NEXT: ret @@ -771,13 +771,13 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { ; RV64I-LABEL: ctpop_v2i32_eq_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addiw a2, a0, -1 +; RV64I-NEXT: addiw a3, a1, -1 +; RV64I-NEXT: xor a1, a1, a3 ; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: sext.w a1, a1 ; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: addiw a2, a1, -1 -; RV64I-NEXT: xor a1, a1, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: sltu a1, a3, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_v2i32_eq_one: @@ -798,13 +798,13 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { ; RV64I-LABEL: ctpop_v2i32_ne_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addiw a2, a0, -1 +; RV64I-NEXT: addiw a3, a1, -1 +; RV64I-NEXT: xor a1, a1, a3 ; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: sext.w a1, a1 ; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: addiw a2, a1, -1 -; RV64I-NEXT: xor a1, a1, a2 -; RV64I-NEXT: sext.w a1, a1 -; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: sltu a1, a3, a1 ; RV64I-NEXT: xori a0, a0, 1 ; RV64I-NEXT: xori a1, a1, 1 ; RV64I-NEXT: ret @@ -1009,9 +1009,9 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { ; RV64I-LABEL: ctpop_v2i64_ult_two: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: addi a2, a1, -1 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: ret @@ -1032,9 +1032,9 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; RV64I-LABEL: ctpop_v2i64_ugt_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: addi a2, a1, -1 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: snez a1, a1 ; RV64I-NEXT: ret @@ -1057,11 +1057,11 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { ; RV64I-LABEL: ctpop_v2i64_eq_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 +; RV64I-NEXT: xor a1, a1, a3 ; 
RV64I-NEXT: xor a0, a0, a2 ; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: addi a2, a1, -1 -; RV64I-NEXT: xor a1, a1, a2 -; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: sltu a1, a3, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_v2i64_eq_one: @@ -1082,11 +1082,11 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { ; RV64I-LABEL: ctpop_v2i64_ne_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 +; RV64I-NEXT: xor a1, a1, a3 ; RV64I-NEXT: xor a0, a0, a2 ; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: addi a2, a1, -1 -; RV64I-NEXT: xor a1, a1, a2 -; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: sltu a1, a3, a1 ; RV64I-NEXT: xori a0, a0, 1 ; RV64I-NEXT: xori a1, a1, 1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll index 649d257b28d76..badfd1af940ca 100644 --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -72,11 +72,11 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 ; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 +; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2 +; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 -; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2 -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] @@ -122,12 +122,12 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 ; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 -; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm4 ; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] @@ -170,27 +170,27 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) { ; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm3 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1 -; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm5 +; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm4 +; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm4 +; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; CHECK-NOBMI-NEXT: pxor %xmm5, %xmm4 +; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm1 +; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm6 +; CHECK-NOBMI-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NOBMI-NEXT: pxor %xmm5, %xmm3 +; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm5 ; CHECK-NOBMI-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm6 -; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm6 -; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm6 -; CHECK-NOBMI-NEXT: pxor %xmm6, %xmm0 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NOBMI-NEXT: pcmpgtd %xmm6, %xmm4 -; CHECK-NOBMI-NEXT: movdqa %xmm4, 
%xmm7 -; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2] -; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm1 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm6, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm5, %xmm7 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2] +; CHECK-NOBMI-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; CHECK-NOBMI-NEXT: andps %xmm7, %xmm0 -; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3] -; CHECK-NOBMI-NEXT: orps %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] +; CHECK-NOBMI-NEXT: orps %xmm5, %xmm0 ; CHECK-NOBMI-NEXT: xorps %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr94829.ll b/llvm/test/CodeGen/X86/pr94829.ll new file mode 100644 index 0000000000000..b858c636cebd8 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr94829.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=x86_64 -o - %s | FileCheck %s + +define i64 @test(i64 %x, i64 %y, i64 %a, i64 %b) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: jg .LBB0_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: leaq -1(%rdi), %rcx +; CHECK-NEXT: andq %rdi, %rcx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.3: # %if.end +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: # %if.then +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: retq +entry: + %ctpop = call i64 @llvm.ctpop.i64(i64 %x) + %cmp1 = icmp ugt i64 %ctpop, 1 + %cmp2 = icmp sgt i64 %y, 0 + %cmp = or i1 %cmp2, %cmp1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + br label %if.end + +if.end: + %res = phi i64 [ %a, %if.then ], [ %b, %entry ] + ret i64 %res +} diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index 741d70a369022..c1d30b6d5a995 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -803,11 +803,10 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) { ; BITALG-LABEL: eq_1_v2i64: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpnleuq %xmm2, %xmm0, %k1 +; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp eq <2 x i64> %2, @@ -883,10 +882,10 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) { ; BITALG-LABEL: ne_1_v2i64: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpleuq %xmm2, %xmm0, %k1 +; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ne <2 x i64> %2, @@ -982,11 +981,10 @@ define <4 x i32> @eq_1_v4i32(<4 x i32> %0) { ; BITALG-LABEL: eq_1_v4i32: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; 
BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpminud %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpnleud %xmm2, %xmm0, %k1 +; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp eq <4 x i32> %2, @@ -1085,10 +1083,10 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; BITALG-LABEL: ne_1_v4i32: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpminud %xmm1, %xmm0, %xmm1 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpleud %xmm2, %xmm0, %k1 +; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ne <4 x i32> %2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index f72ad6d70522f..487f9a5d326cf 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -10,18 +10,18 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_1_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v32i8: @@ -76,13 +76,13 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) { define <32 x i8> @ult_2_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_2_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1003,18 +1003,18 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_1_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; 
AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v16i16: @@ -1069,13 +1069,13 @@ define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) { define <16 x i16> @ult_2_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_2_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3302,18 +3302,18 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_1_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v8i32: @@ -3370,13 +3370,13 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { define <8 x i32> @ult_2_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_2_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -9377,18 +9377,18 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) { define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_1_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v4i64: @@ -9445,13 +9445,13 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { define <4 x i64> @ult_2_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_2_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index 701b9622089db..7fb60b987d95d 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -507,15 +507,15 @@ define <32 x i8> @foldv32i8() nounwind { define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { ; AVX1-LABEL: eq_1_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 @@ -534,12 +534,12 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { ; ; XOP-LABEL: eq_1_v4i64: ; XOP: # %bb.0: -; 
XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomgtuq %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomgtuq %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomgtuq %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -574,11 +574,10 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { ; BITALG-LABEL: eq_1_v4i64: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpminuq %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpnleuq %ymm2, %ymm0, %k1 +; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp eq <4 x i64> %2, @@ -589,21 +588,21 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ne_1_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_1_v4i64: @@ -619,12 +618,12 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; ; XOP-LABEL: ne_1_v4i64: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomleuq %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomleuq %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomleuq %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -660,10 +659,10 @@ 
define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; BITALG-LABEL: ne_1_v4i64: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpminuq %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpleuq %ymm2, %ymm0, %k1 +; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ne <4 x i64> %2, @@ -674,19 +673,19 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { ; AVX1-LABEL: eq_1_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_1_v8i32: @@ -701,12 +700,12 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { ; ; XOP-LABEL: eq_1_v8i32: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomgtud %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomgtud %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomgtud %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -741,11 +740,10 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { ; BITALG-LABEL: eq_1_v8i32: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpminud %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpnleud %ymm2, %ymm0, %k1 +; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp eq <8 x i32> %2, @@ -756,13 +754,13 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ne_1_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor 
%xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 @@ -780,12 +778,12 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; ; XOP-LABEL: ne_1_v8i32: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomleud %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomleud %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomleud %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -821,10 +819,10 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; BITALG-LABEL: ne_1_v8i32: ; BITALG: # %bb.0: ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpminud %ymm1, %ymm0, %ymm1 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpleud %ymm2, %ymm0, %k1 +; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ne <8 x i32> %2, @@ -835,19 +833,19 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { define <16 x i16> @eq_1_v16i16(<16 x i16> %0) { ; AVX1-LABEL: eq_1_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_1_v16i16: @@ -862,12 +860,12 @@ define <16 x i16> @eq_1_v16i16(<16 x i16> %0) { ; ; XOP-LABEL: eq_1_v16i16: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomgtuw %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd 
%xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomgtuw %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomgtuw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -915,13 +913,13 @@ define <16 x i16> @eq_1_v16i16(<16 x i16> %0) { define <16 x i16> @ne_1_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ne_1_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpminuw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 @@ -939,12 +937,12 @@ define <16 x i16> @ne_1_v16i16(<16 x i16> %0) { ; ; XOP-LABEL: ne_1_v16i16: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomleuw %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomleuw %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomleuw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -992,19 +990,19 @@ define <16 x i16> @ne_1_v16i16(<16 x i16> %0) { define <32 x i8> @eq_1_v32i8(<32 x i8> %0) { ; AVX1-LABEL: eq_1_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpminub %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminub %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_1_v32i8: @@ -1019,12 +1017,12 @@ define <32 x i8> @eq_1_v32i8(<32 x i8> %0) { ; ; XOP-LABEL: eq_1_v32i8: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomgtub %xmm3, %xmm1, %xmm1 -; XOP-NEXT: 
vpaddb %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomgtub %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomgtub %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1072,13 +1070,13 @@ define <32 x i8> @eq_1_v32i8(<32 x i8> %0) { define <32 x i8> @ne_1_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ne_1_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpminub %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 @@ -1096,12 +1094,12 @@ define <32 x i8> @ne_1_v32i8(<32 x i8> %0) { ; ; XOP-LABEL: ne_1_v32i8: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpcomleub %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm2 +; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; XOP-NEXT: vpcomleub %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpcomleub %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll index 828c97de3a079..1618a647a4062 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll @@ -9,13 +9,13 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_1_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -32,13 +32,13 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_1_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, 
%ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -68,13 +68,13 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) { define <64 x i8> @ult_2_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_2_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -90,13 +90,13 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_2_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1035,13 +1035,13 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_1_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1058,13 +1058,13 @@ define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_1_v32i16: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, 
%ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1094,13 +1094,13 @@ define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) { define <32 x i16> @ult_2_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_2_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1116,13 +1116,13 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_2_v32i16: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll index 0a5f16a0f635f..f470a2be8aee8 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -451,13 +451,13 @@ define <16 x i32> @ne_1_v16i32(<16 x i32> %0) { define <32 x i16> @eq_1_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: eq_1_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpminuw 
%ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 @@ -476,13 +476,13 @@ define <32 x i16> @eq_1_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: eq_1_v32i16: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm3, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 @@ -514,13 +514,13 @@ define <32 x i16> @eq_1_v32i16(<32 x i16> %0) { define <32 x i16> @ne_1_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ne_1_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpminuw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 @@ -538,13 +538,13 @@ define <32 x i16> @ne_1_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ne_1_v32i16: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm3, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 @@ -575,13 +575,13 @@ define <32 x i16> @ne_1_v32i16(<32 x i16> %0) { define <64 x i8> @eq_1_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: eq_1_v64i8: ; AVX512F: # 
%bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpminub %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpminub %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 @@ -600,13 +600,13 @@ define <64 x i8> @eq_1_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: eq_1_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm3, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 @@ -638,13 +638,13 @@ define <64 x i8> @eq_1_v64i8(<64 x i8> %0) { define <64 x i8> @ne_1_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ne_1_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpminub %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpminub %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 @@ -662,13 +662,13 @@ define <64 x i8> @ne_1_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ne_1_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm3, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; 
AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm3, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm2, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll new file mode 100644 index 0000000000000..9e4a10d9eb864 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p 'require,function(codegenprepare)' -S %s \ +; RUN: | FileCheck %s --check-prefix=SLOW +; RUN: opt -p 'require,function(codegenprepare)' -S --mattr=+zvbb %s \ +; RUN: | FileCheck %s --check-prefix=FAST +; REQUIRES: riscv-registered-target + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64" + +define <4 x i1> @test_ult_2(<4 x i64> %x) { +; SLOW-LABEL: define <4 x i1> @test_ult_2( +; SLOW-SAME: <4 x i64> [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1) +; SLOW-NEXT: [[TMP1:%.*]] = and <4 x i64> [[X]], [[TMP0]] +; SLOW-NEXT: [[CMP1:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer +; SLOW-NEXT: ret <4 x i1> [[CMP1]] +; +; FAST-LABEL: define <4 x i1> @test_ult_2( +; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ult <4 x i64> [[CTPOP]], splat (i64 2) +; FAST-NEXT: ret <4 x i1> [[CMP1]] +; + %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x) + %cmp = icmp ult <4 x i64> %ctpop, splat (i64 2) + ret <4 x i1> %cmp +} + +define <4 x i1> @test_ugt_1(<4 x i64> %x) { +; SLOW-LABEL: define <4 x i1> @test_ugt_1( +; SLOW-SAME: <4 x i64> [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1) +; SLOW-NEXT: [[TMP1:%.*]] = and <4 x i64> [[X]], [[TMP0]] +; SLOW-NEXT: [[CMP1:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; SLOW-NEXT: ret <4 x i1> [[CMP1]] +; +; FAST-LABEL: define <4 x i1> @test_ugt_1( +; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ugt <4 x i64> [[CTPOP]], splat (i64 1) +; FAST-NEXT: ret <4 x i1> [[CMP1]] +; + %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x) + %cmp = icmp ugt <4 x i64> %ctpop, splat (i64 1) + ret <4 x i1> %cmp +} + +define <4 x i1> @test_eq_1(<4 x i64> %x) { +; SLOW-LABEL: define <4 x i1> @test_eq_1( +; SLOW-SAME: <4 x i64> [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1) +; SLOW-NEXT: [[TMP1:%.*]] = xor <4 x i64> [[X]], [[TMP0]] +; SLOW-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[TMP1]], [[TMP0]] +; SLOW-NEXT: ret <4 x i1> [[TMP2]] +; +; FAST-LABEL: define <4 x i1> @test_eq_1( +; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp eq <4 x i64> [[CTPOP]], splat (i64 1) +; FAST-NEXT: ret <4 x i1> [[CMP1]] +; + %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x) + %cmp = icmp eq <4 x i64> %ctpop, splat (i64 1) + ret <4 x i1> %cmp +} + +define <4 x i1> @test_ne_1(<4 x i64> %x) { +; SLOW-LABEL: define <4 x i1> @test_ne_1( +; SLOW-SAME: <4 x 
i64> [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1) +; SLOW-NEXT: [[TMP1:%.*]] = xor <4 x i64> [[X]], [[TMP0]] +; SLOW-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[TMP1]], [[TMP0]] +; SLOW-NEXT: ret <4 x i1> [[TMP2]] +; +; FAST-LABEL: define <4 x i1> @test_ne_1( +; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ne <4 x i64> [[CTPOP]], splat (i64 1) +; FAST-NEXT: ret <4 x i1> [[CMP1]] +; + %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x) + %cmp = icmp ne <4 x i64> %ctpop, splat (i64 1) + ret <4 x i1> %cmp +} diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll new file mode 100644 index 0000000000000..f5a4f913e6c06 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p 'require,function(codegenprepare)' -S %s \ +; RUN: | FileCheck %s --check-prefix=SLOW +; RUN: opt -p 'require,function(codegenprepare)' -S --mattr=+zbb %s \ +; RUN: | FileCheck %s --check-prefix=FAST +; REQUIRES: riscv-registered-target + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64" + +define i1 @test_ult_2(i64 %x) { +; SLOW-LABEL: define i1 @test_ult_2( +; SLOW-SAME: i64 [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1 +; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]] +; SLOW-NEXT: [[CMP1:%.*]] = icmp eq i64 [[TMP1]], 0 +; SLOW-NEXT: ret i1 [[CMP1]] +; +; FAST-LABEL: define i1 @test_ult_2( +; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CTPOP]], 2 +; FAST-NEXT: ret i1 [[CMP1]] +; + %ctpop = call i64 @llvm.ctpop(i64 %x) + %cmp = icmp ult i64 %ctpop, 2 + ret i1 %cmp +} + +define i1 @test_ugt_1(i64 %x) { +; SLOW-LABEL: define i1 @test_ugt_1( +; SLOW-SAME: i64 [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1 +; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]] +; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0 +; SLOW-NEXT: ret i1 [[CMP1]] +; +; FAST-LABEL: define i1 @test_ugt_1( +; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1 +; FAST-NEXT: ret i1 [[CMP1]] +; + %ctpop = call i64 @llvm.ctpop(i64 %x) + %cmp = icmp ugt i64 %ctpop, 1 + ret i1 %cmp +} + +define i1 @test_eq_1_nz(i64 %x) { +; SLOW-LABEL: define i1 @test_eq_1_nz( +; SLOW-SAME: i64 [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1 +; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]] +; SLOW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 +; SLOW-NEXT: ret i1 [[TMP2]] +; +; FAST-LABEL: define i1 @test_eq_1_nz( +; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call range(i64 1, 33) i64 @llvm.ctpop.i64(i64 [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CTPOP]], 2 +; FAST-NEXT: ret i1 [[CMP1]] +; + %ctpop = call range(i64 1, 33) i64 @llvm.ctpop(i64 %x) + %cmp = icmp eq i64 %ctpop, 1 + ret i1 %cmp +} + +define i1 @test_ne_1_nz(i64 %x) { +; SLOW-LABEL: define i1 @test_ne_1_nz( +; SLOW-SAME: i64 [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1 +; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]] +; SLOW-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; SLOW-NEXT: ret i1 [[TMP2]] +; 
+; FAST-LABEL: define i1 @test_ne_1_nz( +; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call range(i64 1, 33) i64 @llvm.ctpop.i64(i64 [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1 +; FAST-NEXT: ret i1 [[CMP1]] +; + %ctpop = call range(i64 1, 33) i64 @llvm.ctpop(i64 %x) + %cmp = icmp ne i64 %ctpop, 1 + ret i1 %cmp +} + +define i1 @test_eq_1(i64 %x) { +; SLOW-LABEL: define i1 @test_eq_1( +; SLOW-SAME: i64 [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1 +; SLOW-NEXT: [[TMP1:%.*]] = xor i64 [[X]], [[TMP0]] +; SLOW-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[TMP0]] +; SLOW-NEXT: ret i1 [[TMP2]] +; +; FAST-LABEL: define i1 @test_eq_1( +; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp eq i64 [[CTPOP]], 1 +; FAST-NEXT: ret i1 [[CMP1]] +; + %ctpop = call i64 @llvm.ctpop(i64 %x) + %cmp = icmp eq i64 %ctpop, 1 + ret i1 %cmp +} + +define i1 @test_ne_1(i64 %x) { +; SLOW-LABEL: define i1 @test_ne_1( +; SLOW-SAME: i64 [[X:%.*]]) { +; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1 +; SLOW-NEXT: [[TMP1:%.*]] = xor i64 [[X]], [[TMP0]] +; SLOW-NEXT: [[TMP2:%.*]] = icmp ule i64 [[TMP1]], [[TMP0]] +; SLOW-NEXT: ret i1 [[TMP2]] +; +; FAST-LABEL: define i1 @test_ne_1( +; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] { +; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]]) +; FAST-NEXT: [[CMP1:%.*]] = icmp ne i64 [[CTPOP]], 1 +; FAST-NEXT: ret i1 [[CMP1]] +; + %ctpop = call i64 @llvm.ctpop(i64 %x) + %cmp = icmp ne i64 %ctpop, 1 + ret i1 %cmp +} From 1a78ef9a9eddd73de7932f5c33a7a7ad7e8b1806 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 23 Apr 2025 08:00:57 +0200 Subject: [PATCH 016/245] [clang][bytecode] Allow casts from void* only in std::allocator calls (#136714) Otherwise, add the missing diagnostic. 
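For illustration, a minimal constexpr snippet adapted from the tests added in this patch (outside of a std::allocator call, which remains exempt): it is diagnosed before C++26 but accepted in C++26, where the cast target is similar to the original pointee type.

```cpp
constexpr int a = 12;
constexpr const int *b = &a;
// Pre-C++26: error, the initializer is not a constant expression, with the note
// "cast from 'void *' is not allowed in a constant expression".
// C++26: accepted, since 'int' is similar to the pointee type of 'b'.
constexpr int *f = (int *)(void *)b;
static_assert(*f == 12);
```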
--- clang/lib/AST/ByteCode/Interp.h | 26 ++++++++--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 57 +----------------------- clang/lib/AST/ByteCode/InterpState.cpp | 30 +++++++++++++ clang/lib/AST/ByteCode/InterpState.h | 8 ++++ clang/test/AST/ByteCode/c.c | 11 ++--- clang/test/AST/ByteCode/cxx11.cpp | 1 + clang/test/AST/ByteCode/cxx23.cpp | 6 +++ clang/test/AST/ByteCode/cxx26.cpp | 5 +++ 8 files changed, 79 insertions(+), 65 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 866d8e4bf2251..e5300b7cd96a9 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1113,6 +1113,12 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { << P.toDiagnosticString(S.getASTContext()); return false; } + } else if (BothNonNull && P.isIntegralPointer()) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_pointer_constant_comparison) + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); + return false; } } @@ -2389,7 +2395,18 @@ static inline bool PtrPtrCast(InterpState &S, CodePtr OpPC, bool SrcIsVoidPtr) { bool HasValidResult = !Ptr.isZero(); if (HasValidResult) { - // FIXME: note_constexpr_invalid_void_star_cast + if (S.getStdAllocatorCaller("allocate")) + return true; + + const auto &E = cast(S.Current->getExpr(OpPC)); + if (S.getLangOpts().CPlusPlus26 && + S.getASTContext().hasSimilarType(Ptr.getType(), + E->getType()->getPointeeType())) + return true; + + S.CCEDiag(E, diag::note_constexpr_invalid_void_star_cast) + << E->getSubExpr()->getType() << S.getLangOpts().CPlusPlus26 + << Ptr.getType().getCanonicalType() << E->getType()->getPointeeType(); } else if (!S.getLangOpts().CPlusPlus26) { const SourceInfo &E = S.Current->getSource(OpPC); S.CCEDiag(E, diag::note_constexpr_invalid_cast) @@ -2781,10 +2798,9 @@ template ::T> inline bool GetIntPtr(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { const T &IntVal = S.Stk.pop(); - if (Desc) - S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast) - << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret - << S.getLangOpts().CPlusPlus; + S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast) + << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret + << S.getLangOpts().CPlusPlus; S.Stk.push(static_cast(IntVal), Desc); return true; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 523e471d3c82c..d8b320ff3ba31 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1526,34 +1526,7 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC, // A call to __operator_new is only valid within std::allocate<>::allocate. // Walk up the call stack to find the appropriate caller and get the // element type from it. 
- QualType ElemType; - const CallExpr *NewCall = nullptr; - - for (const InterpFrame *F = Frame; F; F = F->Caller) { - const Function *Func = F->getFunction(); - if (!Func) - continue; - const auto *MD = dyn_cast_if_present(Func->getDecl()); - if (!MD) - continue; - const IdentifierInfo *FnII = MD->getIdentifier(); - if (!FnII || !FnII->isStr("allocate")) - continue; - - const auto *CTSD = - dyn_cast(MD->getParent()); - if (!CTSD) - continue; - - const IdentifierInfo *ClassII = CTSD->getIdentifier(); - const TemplateArgumentList &TAL = CTSD->getTemplateArgs(); - if (CTSD->isInStdNamespace() && ClassII && ClassII->isStr("allocator") && - TAL.size() >= 1 && TAL[0].getKind() == TemplateArgument::Type) { - ElemType = TAL[0].getAsType(); - NewCall = cast(F->Caller->getExpr(F->getRetPC())); - break; - } - } + auto [NewCall, ElemType] = S.getStdAllocatorCaller("allocate"); if (ElemType.isNull()) { S.FFDiag(Call, S.getLangOpts().CPlusPlus20 @@ -1655,33 +1628,7 @@ static bool interp__builtin_operator_delete(InterpState &S, CodePtr OpPC, return false; // This is permitted only within a call to std::allocator::deallocate. - bool DeallocateFrameFound = false; - for (const InterpFrame *F = Frame; F; F = F->Caller) { - const Function *Func = F->getFunction(); - if (!Func) - continue; - const auto *MD = dyn_cast_if_present(Func->getDecl()); - if (!MD) - continue; - const IdentifierInfo *FnII = MD->getIdentifier(); - if (!FnII || !FnII->isStr("deallocate")) - continue; - - const auto *CTSD = - dyn_cast(MD->getParent()); - if (!CTSD) - continue; - - const IdentifierInfo *ClassII = CTSD->getIdentifier(); - const TemplateArgumentList &TAL = CTSD->getTemplateArgs(); - if (CTSD->isInStdNamespace() && ClassII && ClassII->isStr("allocator") && - TAL.size() >= 1 && TAL[0].getKind() == TemplateArgument::Type) { - DeallocateFrameFound = true; - break; - } - } - - if (!DeallocateFrameFound) { + if (!S.getStdAllocatorCaller("deallocate")) { S.FFDiag(Call); return true; } diff --git a/clang/lib/AST/ByteCode/InterpState.cpp b/clang/lib/AST/ByteCode/InterpState.cpp index 70a2e9b62fc3a..d6e6771f0a04f 100644 --- a/clang/lib/AST/ByteCode/InterpState.cpp +++ b/clang/lib/AST/ByteCode/InterpState.cpp @@ -115,3 +115,33 @@ bool InterpState::maybeDiagnoseDanglingAllocations() { } return NoAllocationsLeft; } + +StdAllocatorCaller InterpState::getStdAllocatorCaller(StringRef Name) const { + for (const InterpFrame *F = Current; F; F = F->Caller) { + const Function *Func = F->getFunction(); + if (!Func) + continue; + const auto *MD = dyn_cast_if_present(Func->getDecl()); + if (!MD) + continue; + const IdentifierInfo *FnII = MD->getIdentifier(); + if (!FnII || !FnII->isStr(Name)) + continue; + + const auto *CTSD = + dyn_cast(MD->getParent()); + if (!CTSD) + continue; + + const IdentifierInfo *ClassII = CTSD->getIdentifier(); + const TemplateArgumentList &TAL = CTSD->getTemplateArgs(); + if (CTSD->isInStdNamespace() && ClassII && ClassII->isStr("allocator") && + TAL.size() >= 1 && TAL[0].getKind() == TemplateArgument::Type) { + QualType ElemType = TAL[0].getAsType(); + const auto *NewCall = cast(F->Caller->getExpr(F->getRetPC())); + return {NewCall, ElemType}; + } + } + + return {}; +} diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h index 528c1a24e7b05..91e09a911ce37 100644 --- a/clang/lib/AST/ByteCode/InterpState.h +++ b/clang/lib/AST/ByteCode/InterpState.h @@ -32,6 +32,12 @@ class InterpStack; class InterpFrame; class SourceMapper; +struct StdAllocatorCaller { + const Expr *Call = nullptr; + 
QualType AllocType; + explicit operator bool() { return Call; } +}; + /// Interpreter context. class InterpState final : public State, public SourceMapper { public: @@ -116,6 +122,8 @@ class InterpState final : public State, public SourceMapper { /// \c true otherwise. bool maybeDiagnoseDanglingAllocations(); + StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const; + private: friend class EvaluationResult; friend class InterpStateCCOverride; diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index fe47f9cab1c9f..a7b1fe07f6d84 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 -Wcast-qual %s -// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -pedantic -verify=pedantic,pedantic-expected,all -std=c11 -Wcast-qual %s -// RUN: %clang_cc1 -triple x86_64-linux -verify=ref,all -std=c11 -Wcast-qual %s -// RUN: %clang_cc1 -triple x86_64-linux -pedantic -verify=pedantic,pedantic-ref,all -std=c11 -Wcast-qual %s +// RUN: %clang_cc1 -triple x86_64-linux -verify=expected,all -std=c11 -Wcast-qual %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -triple x86_64-linux -verify=pedantic,pedantic-expected,all -std=c11 -Wcast-qual -pedantic %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -triple x86_64-linux -verify=ref,all -std=c11 -Wcast-qual %s +// RUN: %clang_cc1 -triple x86_64-linux -verify=pedantic,pedantic-ref,all -std=c11 -Wcast-qual -pedantic %s typedef __INTPTR_TYPE__ intptr_t; typedef __PTRDIFF_TYPE__ ptrdiff_t; @@ -231,7 +231,8 @@ int castViaInt[*(int*)(unsigned long)"test"]; // ref-error {{variable length arr // expected-error {{variable length array}} \ // pedantic-expected-error {{variable length array}} -const void (*const funcp)(void) = (void*)123; // pedantic-warning {{converts between void pointer and function pointer}} +const void (*const funcp)(void) = (void*)123; // pedantic-warning {{converts between void pointer and function pointer}} \ + // pedantic-expected-note {{this conversion is not allowed in a constant expression}} _Static_assert(funcp == (void*)0, ""); // all-error {{failed due to requirement 'funcp == (void *)0'}} \ // pedantic-warning {{expression is not an integer constant expression}} _Static_assert(funcp == (void*)123, ""); // pedantic-warning {{equality comparison between function pointer and void pointer}} \ diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 4c69517304ea7..004f704145afd 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -191,5 +191,6 @@ namespace DynamicCast { constexpr S* sptr = &s; struct Str { int b : reinterpret_cast(sptr) == reinterpret_cast(sptr); + int g : (S*)(void*)(sptr) == sptr; }; } diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index d0ade4f5278b1..ce18a9d473302 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -316,3 +316,9 @@ namespace ZeroSizedArray { } static_assert(foo() == 1); } +namespace VoidCast { + constexpr int a = 12; + constexpr const int *b = &a; + constexpr int *f = (int*)(void*)b; // all-error {{must be initialized by a constant expression}} \ + // all-note {{cast from 'void *' is not allowed in a constant expression}} +} diff --git a/clang/test/AST/ByteCode/cxx26.cpp b/clang/test/AST/ByteCode/cxx26.cpp index cd6b533065010..cd786b17ca9ab 100644 
--- a/clang/test/AST/ByteCode/cxx26.cpp +++ b/clang/test/AST/ByteCode/cxx26.cpp @@ -31,3 +31,8 @@ namespace ReplaceableAlloc { static_assert(foo()); // both-error {{not an integral constant expression}} \ // both-note {{in call to}} } + +constexpr int a = 12; +constexpr const int *b = &a; +constexpr int *f = (int*)(void*)b; +static_assert(*f == 12); From 832ca744f2f25a7a5334f2f04380c84e41f71678 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Wed, 23 Apr 2025 14:16:23 +0800 Subject: [PATCH 017/245] [RISCV] Add Andes N45/NX45 processor definition (#136670) Andes N45/NX45 are 32/64bit in-order dual-issue 8-stage pipeline CPU architecture implementing the RV[32|64]IMAFDC_Zba_Zbb_Zbs ISA extensions. They are developed by Andes Technology https://www.andestech.com, a RISC-V IP provider. The overviews for N45/NX45: https://www.andestech.com/en/products-solutions/andescore-processors/riscv-n45/ https://www.andestech.com/en/products-solutions/andescore-processors/riscv-nx45/ Scheduling model will be implemented in a later PR. --- clang/test/Driver/riscv-cpus.c | 34 +++++++++++++++++++ .../test/Misc/target-invalid-cpu-note/riscv.c | 12 ++++--- llvm/docs/ReleaseNotes.md | 1 + llvm/lib/Target/RISCV/RISCVProcessors.td | 26 ++++++++++++++ 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index c2314efd34aa6..19da8ede26a40 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -692,3 +692,37 @@ // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=syntacore-scr7 | FileCheck -check-prefix=MTUNE-SYNTACORE-SCR7 %s // MTUNE-SYNTACORE-SCR7: "-tune-cpu" "syntacore-scr7" + +// RUN: %clang --target=riscv32 -### -c %s 2>&1 -mcpu=andes-n45 | FileCheck -check-prefix=MCPU-ANDES-N45 %s +// MCPU-ANDES-N45: "-target-cpu" "andes-n45" +// MCPU-ANDES-N45-SAME: "-target-feature" "+m" +// MCPU-ANDES-N45-SAME: "-target-feature" "+a" +// MCPU-ANDES-N45-SAME: "-target-feature" "+f" +// MCPU-ANDES-N45-SAME: "-target-feature" "+d" +// MCPU-ANDES-N45-SAME: "-target-feature" "+c" +// MCPU-ANDES-N45-SAME: "-target-feature" "+zicsr" +// MCPU-ANDES-N45-SAME: "-target-feature" "+zifencei" +// MCPU-ANDES-N45-SAME: "-target-feature" "+zba" +// MCPU-ANDES-N45-SAME: "-target-feature" "+zbb" +// MCPU-ANDES-N45-SAME: "-target-feature" "+zbs" +// MCPU-ANDES-N45-SAME: "-target-abi" "ilp32d" + +// RUN: %clang --target=riscv32 -### -c %s 2>&1 -mtune=andes-n45 | FileCheck -check-prefix=MTUNE-ANDES-N45 %s +// MTUNE-ANDES-N45: "-tune-cpu" "andes-n45" + +// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=andes-nx45 | FileCheck -check-prefix=MCPU-ANDES-NX45 %s +// MCPU-ANDES-NX45: "-target-cpu" "andes-nx45" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+m" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+a" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+f" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+d" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+c" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+zicsr" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+zifencei" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+zba" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+zbb" +// MCPU-ANDES-NX45-SAME: "-target-feature" "+zbs" +// MCPU-ANDES-NX45-SAME: "-target-abi" "lp64d" + +// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=andes-nx45 | FileCheck -check-prefix=MTUNE-ANDES-NX45 %s +// MTUNE-ANDES-NX45: "-tune-cpu" "andes-nx45" diff --git a/clang/test/Misc/target-invalid-cpu-note/riscv.c b/clang/test/Misc/target-invalid-cpu-note/riscv.c index 
199916f70c14f..cd8a8bf95dd7a 100644 --- a/clang/test/Misc/target-invalid-cpu-note/riscv.c +++ b/clang/test/Misc/target-invalid-cpu-note/riscv.c @@ -5,7 +5,8 @@ // RUN: not %clang_cc1 -triple riscv32 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV32 // RISCV32: error: unknown target CPU 'not-a-cpu' // RISCV32-NEXT: note: valid target CPU values are: -// RISCV32-SAME: {{^}} generic-rv32 +// RISCV32-SAME: {{^}} andes-n45 +// RISCV32-SAME: {{^}}, generic-rv32 // RISCV32-SAME: {{^}}, rocket-rv32 // RISCV32-SAME: {{^}}, rp2350-hazard3 // RISCV32-SAME: {{^}}, sifive-e20 @@ -24,7 +25,8 @@ // RUN: not %clang_cc1 -triple riscv64 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV64 // RISCV64: error: unknown target CPU 'not-a-cpu' // RISCV64-NEXT: note: valid target CPU values are: -// RISCV64-SAME: {{^}} generic-rv64 +// RISCV64-SAME: {{^}} andes-nx45 +// RISCV64-SAME: {{^}}, generic-rv64 // RISCV64-SAME: {{^}}, mips-p8700 // RISCV64-SAME: {{^}}, rocket-rv64 // RISCV64-SAME: {{^}}, sifive-p450 @@ -52,7 +54,8 @@ // RUN: not %clang_cc1 -triple riscv32 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV32 // TUNE-RISCV32: error: unknown target CPU 'not-a-cpu' // TUNE-RISCV32-NEXT: note: valid target CPU values are: -// TUNE-RISCV32-SAME: {{^}} generic-rv32 +// TUNE-RISCV32-SAME: {{^}} andes-n45 +// TUNE-RISCV32-SAME: {{^}}, generic-rv32 // TUNE-RISCV32-SAME: {{^}}, rocket-rv32 // TUNE-RISCV32-SAME: {{^}}, rp2350-hazard3 // TUNE-RISCV32-SAME: {{^}}, sifive-e20 @@ -75,7 +78,8 @@ // RUN: not %clang_cc1 -triple riscv64 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV64 // TUNE-RISCV64: error: unknown target CPU 'not-a-cpu' // TUNE-RISCV64-NEXT: note: valid target CPU values are: -// TUNE-RISCV64-SAME: {{^}} generic-rv64 +// TUNE-RISCV64-SAME: {{^}} andes-nx45 +// TUNE-RISCV64-SAME: {{^}}, generic-rv64 // TUNE-RISCV64-SAME: {{^}}, mips-p8700 // TUNE-RISCV64-SAME: {{^}}, rocket-rv64 // TUNE-RISCV64-SAME: {{^}}, sifive-p450 diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index cc4fd38944ff6..a3f91224ca24e 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -168,6 +168,7 @@ Changes to the RISC-V Backend and branch and linker relaxation. This can be disabled with ``.option noexact``, which is also the default. * `-mcpu=xiangshan-kunminghu` was added. +* `-mcpu=andes-n45` and `-mcpu=andes-nx45` were added. 
Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 4b288a9cfcb49..5e52ba87cb346 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -625,3 +625,29 @@ def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3", FeatureStdExtZbkb, FeatureStdExtZcb, FeatureStdExtZcmp]>; + +def ANDES_N45 : RISCVProcessorModel<"andes-n45", + NoSchedModel, + [Feature32Bit, + FeatureStdExtI, + FeatureStdExtZicsr, + FeatureStdExtZifencei, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtB]>; + +def ANDES_NX45 : RISCVProcessorModel<"andes-nx45", + NoSchedModel, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtZicsr, + FeatureStdExtZifencei, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtB]>; From 30c47147262523663892836fee42e02f8f9366f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Wed, 23 Apr 2025 07:32:24 +0100 Subject: [PATCH 018/245] [mlir][utils] Update generate-test-checks.py (#136757) At the moment, the `CHECK-SAME` lines generated by "generate-test-checks.py" (i.e. check-lines that correspond to the preceeding `CHECK-LABEL` line) are indented to match the label length. For example, ```mlir func.func @batch_reduce_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<3x7xf32>) { linalg.batch_reduce_matmul indexing_maps = (...) } ``` will lead to the following: ```mlir // CHECK-LABEL: func.func @batch_reduce_matmul_bcast_k_to_fill_missing_dims_A( // CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<2x5x7xf32>, // CHECK-SAME: %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<3x7xf32>) { // CHECK: linalg.batch_reduce_matmul indexing_maps = (...) ``` This indentation is unnecasarilly deep. With this change, for labales that are longer than 20 chars, the indentation is trimmed to 4 spaces: ```mlir // CHECK-LABEL: func.func @batch_reduce_matmul_bcast_k_to_fill_missing_dims_A( // CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<2x5x7xf32>, // CHECK-SAME: %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<3x7xf32>) { // CHECK: linalg.batch_reduce_matmul indexing_maps = (...) ``` --- mlir/utils/generate-test-checks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index f1dd7a2843893..d157af9c3cab7 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -408,8 +408,12 @@ def main(): for argument in ssa_split[1:]: output_line += "// " + args.check_prefix + "-SAME: " - # Pad to align with the original position in the line. - output_line += " " * len(ssa_split[0]) + # Pad to align with the original position in the line (i.e. where the label ends), + # unless the label is more than 20 chars long, in which case pad with 4 spaces + # (this is to avoid deep indentation). + label_length = len(ssa_split[0]) + pad_depth = label_length if label_length < 21 else 4 + output_line += " " * pad_depth # Process the rest of the line. 
output_line += process_line( From 665914fea1433409015a87fef2837218bcd21460 Mon Sep 17 00:00:00 2001 From: Mythreya Date: Tue, 22 Apr 2025 23:36:48 -0700 Subject: [PATCH 019/245] [clangd] Improve `BlockEnd` inlayhint presentation (#136106) * Only show for blocks 10 lines or taller (including braces) * Add parens for function call: "// if foo" -> "// if foo()" or "// if foo(...)" * Print literal nullptr * Escaping for abbreviated strings Fixes https://github.com/clangd/clangd/issues/1807. Based on the original PR at https://github.com/llvm/llvm-project/pull/72345. Co-authored-by: daiyousei-qz --- clang-tools-extra/clangd/InlayHints.cpp | 33 +++-- clang-tools-extra/clangd/InlayHints.h | 9 +- .../clangd/unittests/InlayHintTests.cpp | 140 ++++++++++++++---- 3 files changed, 138 insertions(+), 44 deletions(-) diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index 40a824618f782..bdab2b8a9f377 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -112,7 +112,9 @@ std::string summarizeExpr(const Expr *E) { return getSimpleName(*E->getFoundDecl()).str(); } std::string VisitCallExpr(const CallExpr *E) { - return Visit(E->getCallee()); + std::string Result = Visit(E->getCallee()); + Result += E->getNumArgs() == 0 ? "()" : "(...)"; + return Result; } std::string VisitCXXDependentScopeMemberExpr(const CXXDependentScopeMemberExpr *E) { @@ -147,6 +149,9 @@ std::string summarizeExpr(const Expr *E) { } // Literals are just printed + std::string VisitCXXNullPtrLiteralExpr(const CXXNullPtrLiteralExpr *E) { + return "nullptr"; + } std::string VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *E) { return E->getValue() ? "true" : "false"; } @@ -165,12 +170,14 @@ std::string summarizeExpr(const Expr *E) { std::string Result = "\""; if (E->containsNonAscii()) { Result += "..."; - } else if (E->getLength() > 10) { - Result += E->getString().take_front(7); - Result += "..."; } else { llvm::raw_string_ostream OS(Result); - llvm::printEscapedString(E->getString(), OS); + if (E->getLength() > 10) { + llvm::printEscapedString(E->getString().take_front(7), OS); + Result += "..."; + } else { + llvm::printEscapedString(E->getString(), OS); + } } Result.push_back('"'); return Result; @@ -408,12 +415,14 @@ struct Callee { class InlayHintVisitor : public RecursiveASTVisitor { public: InlayHintVisitor(std::vector &Results, ParsedAST &AST, - const Config &Cfg, std::optional RestrictRange) + const Config &Cfg, std::optional RestrictRange, + InlayHintOptions HintOptions) : Results(Results), AST(AST.getASTContext()), Tokens(AST.getTokens()), Cfg(Cfg), RestrictRange(std::move(RestrictRange)), MainFileID(AST.getSourceManager().getMainFileID()), Resolver(AST.getHeuristicResolver()), - TypeHintPolicy(this->AST.getPrintingPolicy()) { + TypeHintPolicy(this->AST.getPrintingPolicy()), + HintOptions(HintOptions) { bool Invalid = false; llvm::StringRef Buf = AST.getSourceManager().getBufferData(MainFileID, &Invalid); @@ -1120,7 +1129,6 @@ class InlayHintVisitor : public RecursiveASTVisitor { // Otherwise, the hint shouldn't be shown. 
std::optional computeBlockEndHintRange(SourceRange BraceRange, StringRef OptionalPunctuation) { - constexpr unsigned HintMinLineLimit = 2; auto &SM = AST.getSourceManager(); auto [BlockBeginFileId, BlockBeginOffset] = @@ -1148,7 +1156,7 @@ class InlayHintVisitor : public RecursiveASTVisitor { auto RBraceLine = SM.getLineNumber(RBraceFileId, RBraceOffset); // Don't show hint on trivial blocks like `class X {};` - if (BlockBeginLine + HintMinLineLimit - 1 > RBraceLine) + if (BlockBeginLine + HintOptions.HintMinLineLimit - 1 > RBraceLine) return std::nullopt; // This is what we attach the hint to, usually "}" or "};". @@ -1178,17 +1186,20 @@ class InlayHintVisitor : public RecursiveASTVisitor { StringRef MainFileBuf; const HeuristicResolver *Resolver; PrintingPolicy TypeHintPolicy; + InlayHintOptions HintOptions; }; } // namespace std::vector inlayHints(ParsedAST &AST, - std::optional RestrictRange) { + std::optional RestrictRange, + InlayHintOptions HintOptions) { std::vector Results; const auto &Cfg = Config::current(); if (!Cfg.InlayHints.Enabled) return Results; - InlayHintVisitor Visitor(Results, AST, Cfg, std::move(RestrictRange)); + InlayHintVisitor Visitor(Results, AST, Cfg, std::move(RestrictRange), + HintOptions); Visitor.TraverseAST(AST.getASTContext()); // De-duplicate hints. Duplicates can sometimes occur due to e.g. explicit diff --git a/clang-tools-extra/clangd/InlayHints.h b/clang-tools-extra/clangd/InlayHints.h index 6a0236a0ab08a..612434b728456 100644 --- a/clang-tools-extra/clangd/InlayHints.h +++ b/clang-tools-extra/clangd/InlayHints.h @@ -22,10 +22,17 @@ namespace clang { namespace clangd { class ParsedAST; +struct InlayHintOptions { + // Minimum height of a code block in lines for a BlockEnd hint to be shown + // Includes the lines containing the braces + int HintMinLineLimit = 10; +}; + /// Compute and return inlay hints for a file. /// If RestrictRange is set, return only hints whose location is in that range. std::vector inlayHints(ParsedAST &AST, - std::optional RestrictRange); + std::optional RestrictRange, + InlayHintOptions HintOptions = {}); } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index 030e499577706..c3331d20730f1 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -36,9 +36,12 @@ namespace { using ::testing::ElementsAre; using ::testing::IsEmpty; -std::vector hintsOfKind(ParsedAST &AST, InlayHintKind Kind) { +constexpr InlayHintOptions DefaultOptsForTests{2}; + +std::vector hintsOfKind(ParsedAST &AST, InlayHintKind Kind, + InlayHintOptions Opts) { std::vector Result; - for (auto &Hint : inlayHints(AST, /*RestrictRange=*/std::nullopt)) { + for (auto &Hint : inlayHints(AST, /*RestrictRange=*/std::nullopt, Opts)) { if (Hint.kind == Kind) Result.push_back(Hint); } @@ -90,7 +93,7 @@ Config noHintsConfig() { template void assertHintsWithHeader(InlayHintKind Kind, llvm::StringRef AnnotatedSource, - llvm::StringRef HeaderContent, + llvm::StringRef HeaderContent, InlayHintOptions Opts, ExpectedHints... 
Expected) { Annotations Source(AnnotatedSource); TestTU TU = TestTU::withCode(Source.code()); @@ -98,18 +101,18 @@ void assertHintsWithHeader(InlayHintKind Kind, llvm::StringRef AnnotatedSource, TU.HeaderCode = HeaderContent; auto AST = TU.build(); - EXPECT_THAT(hintsOfKind(AST, Kind), + EXPECT_THAT(hintsOfKind(AST, Kind, Opts), ElementsAre(HintMatcher(Expected, Source)...)); // Sneak in a cross-cutting check that hints are disabled by config. // We'll hit an assertion failure if addInlayHint still gets called. WithContextValue WithCfg(Config::Key, noHintsConfig()); - EXPECT_THAT(inlayHints(AST, std::nullopt), IsEmpty()); + EXPECT_THAT(inlayHints(AST, std::nullopt, Opts), IsEmpty()); } template void assertHints(InlayHintKind Kind, llvm::StringRef AnnotatedSource, - ExpectedHints... Expected) { - return assertHintsWithHeader(Kind, AnnotatedSource, "", + InlayHintOptions Opts, ExpectedHints... Expected) { + return assertHintsWithHeader(Kind, AnnotatedSource, "", Opts, std::move(Expected)...); } @@ -120,14 +123,16 @@ template void assertParameterHints(llvm::StringRef AnnotatedSource, ExpectedHints... Expected) { ignore(Expected.Side = Left...); - assertHints(InlayHintKind::Parameter, AnnotatedSource, Expected...); + assertHints(InlayHintKind::Parameter, AnnotatedSource, DefaultOptsForTests, + Expected...); } template void assertTypeHints(llvm::StringRef AnnotatedSource, ExpectedHints... Expected) { ignore(Expected.Side = Right...); - assertHints(InlayHintKind::Type, AnnotatedSource, Expected...); + assertHints(InlayHintKind::Type, AnnotatedSource, DefaultOptsForTests, + Expected...); } template @@ -136,16 +141,25 @@ void assertDesignatorHints(llvm::StringRef AnnotatedSource, Config Cfg; Cfg.InlayHints.Designators = true; WithContextValue WithCfg(Config::Key, std::move(Cfg)); - assertHints(InlayHintKind::Designator, AnnotatedSource, Expected...); + assertHints(InlayHintKind::Designator, AnnotatedSource, DefaultOptsForTests, + Expected...); } template -void assertBlockEndHints(llvm::StringRef AnnotatedSource, - ExpectedHints... Expected) { +void assertBlockEndHintsWithOpts(llvm::StringRef AnnotatedSource, + InlayHintOptions Opts, + ExpectedHints... Expected) { Config Cfg; Cfg.InlayHints.BlockEnd = true; WithContextValue WithCfg(Config::Key, std::move(Cfg)); - assertHints(InlayHintKind::BlockEnd, AnnotatedSource, Expected...); + assertHints(InlayHintKind::BlockEnd, AnnotatedSource, Opts, Expected...); +} + +template +void assertBlockEndHints(llvm::StringRef AnnotatedSource, + ExpectedHints... Expected) { + assertBlockEndHintsWithOpts(AnnotatedSource, DefaultOptsForTests, + Expected...); } TEST(ParameterHints, Smoke) { @@ -1226,7 +1240,9 @@ TEST(ParameterHints, IncludeAtNonGlobalScope) { ASSERT_TRUE(bool(AST)); // Ensure the hint for the call in foo.inc is NOT materialized in foo.cc. 
- EXPECT_EQ(hintsOfKind(*AST, InlayHintKind::Parameter).size(), 0u); + EXPECT_EQ( + hintsOfKind(*AST, InlayHintKind::Parameter, DefaultOptsForTests).size(), + 0u); } TEST(TypeHints, Smoke) { @@ -1488,12 +1504,12 @@ TEST(DefaultArguments, Smoke) { void baz(int = 5) { if (false) baz($unnamed[[)]]; }; )cpp"; - assertHints(InlayHintKind::DefaultArgument, Code, + assertHints(InlayHintKind::DefaultArgument, Code, DefaultOptsForTests, ExpectedHint{"A: 4", "default1", Left}, ExpectedHint{", B: 1, C: foo()", "default2", Left}, ExpectedHint{"5", "unnamed", Left}); - assertHints(InlayHintKind::Parameter, Code, + assertHints(InlayHintKind::Parameter, Code, DefaultOptsForTests, ExpectedHint{"A: ", "explicit", Left}); } @@ -1528,14 +1544,14 @@ TEST(DefaultArguments, WithoutParameterNames) { } )cpp"; - assertHints(InlayHintKind::DefaultArgument, Code, + assertHints(InlayHintKind::DefaultArgument, Code, DefaultOptsForTests, ExpectedHint{"...", "abbreviated", Left}, ExpectedHint{", Baz{}", "paren", Left}, ExpectedHint{", Baz{}", "brace1", Left}, ExpectedHint{", Baz{}", "brace2", Left}, ExpectedHint{", Baz{}", "brace3", Left}); - assertHints(InlayHintKind::Parameter, Code); + assertHints(InlayHintKind::Parameter, Code, DefaultOptsForTests); } TEST(TypeHints, Deduplication) { @@ -1573,7 +1589,8 @@ TEST(TypeHints, Aliased) { TU.ExtraArgs.push_back("-xc"); auto AST = TU.build(); - EXPECT_THAT(hintsOfKind(AST, InlayHintKind::Type), IsEmpty()); + EXPECT_THAT(hintsOfKind(AST, InlayHintKind::Type, DefaultOptsForTests), + IsEmpty()); } TEST(TypeHints, CallingConvention) { @@ -1590,7 +1607,7 @@ TEST(TypeHints, CallingConvention) { auto AST = TU.build(); EXPECT_THAT( - hintsOfKind(AST, InlayHintKind::Type), + hintsOfKind(AST, InlayHintKind::Type, DefaultOptsForTests), ElementsAre(HintMatcher(ExpectedHint{"-> void", "lambda"}, Source))); } @@ -1673,7 +1690,7 @@ TEST(TypeHints, SubstTemplateParameterAliases) { )cpp"; assertHintsWithHeader( - InlayHintKind::Type, VectorIntPtr, Header, + InlayHintKind::Type, VectorIntPtr, Header, DefaultOptsForTests, ExpectedHint{": int *", "no_modifier"}, ExpectedHint{": int **", "ptr_modifier"}, ExpectedHint{": int *&", "ref_modifier"}, @@ -1697,7 +1714,7 @@ TEST(TypeHints, SubstTemplateParameterAliases) { )cpp"; assertHintsWithHeader( - InlayHintKind::Type, VectorInt, Header, + InlayHintKind::Type, VectorInt, Header, DefaultOptsForTests, ExpectedHint{": int", "no_modifier"}, ExpectedHint{": int *", "ptr_modifier"}, ExpectedHint{": int &", "ref_modifier"}, @@ -1724,6 +1741,7 @@ TEST(TypeHints, SubstTemplateParameterAliases) { )cpp"; assertHintsWithHeader(InlayHintKind::Type, TypeAlias, Header, + DefaultOptsForTests, ExpectedHint{": Short", "short_name"}, ExpectedHint{": static_vector", "vector_name"}); } @@ -2016,6 +2034,7 @@ TEST(BlockEndHints, If) { assertBlockEndHints( R"cpp( void foo(bool cond) { + void* ptr; if (cond) ; @@ -2041,13 +2060,17 @@ TEST(BlockEndHints, If) { if (int i = 0; i > 10) { $init_cond[[}]] + + if (ptr != nullptr) { + $null_check[[}]] } // suppress )cpp", ExpectedHint{" // if cond", "simple"}, ExpectedHint{" // if cond", "ifelse"}, ExpectedHint{" // if", "elseif"}, ExpectedHint{" // if !cond", "inner"}, ExpectedHint{" // if cond", "outer"}, ExpectedHint{" // if X", "init"}, - ExpectedHint{" // if i > 10", "init_cond"}); + ExpectedHint{" // if i > 10", "init_cond"}, + ExpectedHint{" // if ptr != nullptr", "null_check"}); } TEST(BlockEndHints, Loops) { @@ -2124,30 +2147,41 @@ TEST(BlockEndHints, PrintRefs) { R"cpp( namespace ns { int Var; - int func(); + int 
func1(); + int func2(int, int); struct S { int Field; - int method() const; + int method1() const; + int method2(int, int) const; }; // suppress } // suppress void foo() { + int int_a {}; while (ns::Var) { $var[[}]] - while (ns::func()) { - $func[[}]] + while (ns::func1()) { + $func1[[}]] + + while (ns::func2(int_a, int_a)) { + $func2[[}]] while (ns::S{}.Field) { $field[[}]] - while (ns::S{}.method()) { - $method[[}]] + while (ns::S{}.method1()) { + $method1[[}]] + + while (ns::S{}.method2(int_a, int_a)) { + $method2[[}]] } // suppress )cpp", ExpectedHint{" // while Var", "var"}, - ExpectedHint{" // while func", "func"}, + ExpectedHint{" // while func1()", "func1"}, + ExpectedHint{" // while func2(...)", "func2"}, ExpectedHint{" // while Field", "field"}, - ExpectedHint{" // while method", "method"}); + ExpectedHint{" // while method1()", "method1"}, + ExpectedHint{" // while method2(...)", "method2"}); } TEST(BlockEndHints, PrintConversions) { @@ -2307,7 +2341,49 @@ TEST(BlockEndHints, PointerToMemberFunction) { $ptrmem[[}]] } // suppress )cpp", - ExpectedHint{" // if", "ptrmem"}); + ExpectedHint{" // if ()", "ptrmem"}); +} + +TEST(BlockEndHints, MinLineLimit) { + InlayHintOptions Opts; + Opts.HintMinLineLimit = 10; + + // namespace ns below is exactly 10 lines + assertBlockEndHintsWithOpts( + R"cpp( + namespace ns { + int Var; + int func1(); + int func2(int, int); + struct S { + int Field; + int method1() const; + int method2(int, int) const; + }; + $namespace[[}]] + void foo() { + int int_a {}; + while (ns::Var) { + } + + while (ns::func1()) { + } + + while (ns::func2(int_a, int_a)) { + } + + while (ns::S{}.Field) { + } + + while (ns::S{}.method1()) { + } + + while (ns::S{}.method2(int_a, int_a)) { + } + $foo[[}]] + )cpp", + Opts, ExpectedHint{" // namespace ns", "namespace"}, + ExpectedHint{" // foo", "foo"}); } // FIXME: Low-hanging fruit where we could omit a type hint: From 98b6f8dc699d789d834e5b6d810ed217f560aad0 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 23 Apr 2025 07:46:27 +0100 Subject: [PATCH 020/245] [CostModel] Remove optional from InstructionCost::getValue() (#135596) InstructionCost is already an optional value, containing an Invalid state that can be checked with isValid(). There is little point in returning another optional from getValue(). Most uses do not make use of it being a std::optional, dereferencing the value directly (either isValid has been checked previously or the Cost is assumed to be valid). The one case that does in AMDGPU used value_or which has been replaced by a isValid() check. 
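For reference, a minimal sketch of the caller-side effect (illustrative only; the cost is taken as a parameter rather than computed, so the snippet stays self-contained):

```cpp
#include "llvm/Support/InstructionCost.h"

void example(const llvm::InstructionCost &Cost) {
  // Before: getValue() returned std::optional<CostType>, so callers wrote
  //   auto V = *Cost.getValue();
  // After: getValue() asserts isValid() and returns CostType directly.
  if (Cost.isValid()) {
    llvm::InstructionCost::CostType V = Cost.getValue(); // no dereference needed
    (void)V;
  }
}
```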
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 ++-- llvm/include/llvm/Support/InstructionCost.h | 8 +++----- llvm/include/llvm/Transforms/Utils/UnrollLoop.h | 2 +- llvm/lib/Analysis/CostModel.cpp | 4 ++-- llvm/lib/CodeGen/SelectOptimize.cpp | 6 +++--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- .../Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 ++-- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 +- .../Target/SystemZ/SystemZTargetTransformInfo.cpp | 2 +- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 6 +++--- llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 4 ++-- llvm/lib/Transforms/IPO/PartialInlining.cpp | 2 +- llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 4 ++-- llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 2 +- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 6 +++--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------ llvm/unittests/Support/InstructionCostTest.cpp | 5 ++--- 22 files changed, 42 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0ef6bf5d45f4d..80df6d7d956d3 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1610,7 +1610,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Scale the cost of the load by the fraction of legal instructions that // will be used. - Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); + Cost = divideCeil(UsedInsts.count() * Cost.getValue(), NumLegalInsts); } // Then plus the cost of interleave operation. @@ -2878,7 +2878,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { SubTp && SubTp->getElementType() == FTp->getElementType()) return divideCeil(FTp->getNumElements(), SubTp->getNumElements()); } - return *LT.first.getValue(); + return LT.first.getValue(); } InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h index b5af0e0401ef2..d5f7457e04748 100644 --- a/llvm/include/llvm/Support/InstructionCost.h +++ b/llvm/include/llvm/Support/InstructionCost.h @@ -20,7 +20,6 @@ #include "llvm/Support/MathExtras.h" #include -#include namespace llvm { @@ -84,10 +83,9 @@ class InstructionCost { /// This function is intended to be used as sparingly as possible, since the /// class provides the full range of operator support required for arithmetic /// and comparisons. - std::optional getValue() const { - if (isValid()) - return Value; - return std::nullopt; + CostType getValue() const { + assert(isValid()); + return Value; } /// For all of the arithmetic operators provided here any invalid state is diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index ed560f6f6e2fa..6759afd8077e9 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -143,7 +143,7 @@ class UnrollCostEstimator { /// Whether it is legal to unroll this loop. 
bool canUnroll() const; - uint64_t getRolledLoopSize() const { return *LoopSize.getValue(); } + uint64_t getRolledLoopSize() const { return LoopSize.getValue(); } /// Returns loop size estimation for unrolled loop, given the unrolling /// configuration specified by UP. diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp index cec0fb6b98dea..6d8bd7d924074 100644 --- a/llvm/lib/Analysis/CostModel.cpp +++ b/llvm/lib/Analysis/CostModel.cpp @@ -128,8 +128,8 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F, } else { InstructionCost Cost = getCost(Inst, OutputCostKindToTargetCostKind(CostKind), TTI, TLI); - if (auto CostVal = Cost.getValue()) - OS << "Found an estimated cost of " << *CostVal; + if (Cost.isValid()) + OS << "Found an estimated cost of " << Cost.getValue(); else OS << "Invalid cost"; OS << " for instruction: " << Inst << "\n"; diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 00148b075134a..13ed8f28d5507 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -206,7 +206,7 @@ class SelectOptimizeImpl { getI()->getOpcode(), I->getType(), TargetTransformInfo::TCK_Latency, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}); - auto TotalCost = Scaled64::get(*Cost.getValue()); + auto TotalCost = Scaled64::get(Cost.getValue()); if (auto *OpI = dyn_cast(I->getOperand(1 - CondIdx))) { auto It = InstCostMap.find(OpI); if (It != InstCostMap.end()) @@ -1380,8 +1380,8 @@ std::optional SelectOptimizeImpl::computeInstCost(const Instruction *I) { InstructionCost ICost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); - if (auto OC = ICost.getValue()) - return std::optional(*OC); + if (ICost.isValid()) + return std::optional(ICost.getValue()); return std::nullopt; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 771eee1b3fecf..0ff681c8dbac6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -28530,7 +28530,7 @@ bool AArch64TargetLowering::shouldLocalize( Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize); assert(Cost.isValid() && "Expected a valid imm cost"); - unsigned RematCost = *Cost.getValue(); + unsigned RematCost = Cost.getValue(); RematCost += AdditionalCost; Register Reg = MI.getOperand(0).getReg(); unsigned MaxUses = maxUses(RematCost); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a20f1c104834d..324e234db6120 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4618,7 +4618,7 @@ static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, } if (FinalSize) - *FinalSize = *LoopCost.getValue(); + *FinalSize = LoopCost.getValue(); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index dd3bec774ec67..1506f02793ba4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -205,8 +205,8 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M, TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); assert(Cost != InstructionCost::getMax()); // Assume expensive if we can't tell the cost of an instruction. 
- CostType CostVal = - Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive); + CostType CostVal = Cost.isValid() ? Cost.getValue() + : TargetTransformInfo::TCC_Expensive; assert((FnCost + CostVal) >= FnCost && "Overflow!"); FnCost += CostVal; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index cc2ca77ffb792..9512bcd5c4a13 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1277,9 +1277,9 @@ static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, // The penalty cost is computed relative to the cost of instructions and does // not model any storage costs. adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) * - *ArgStackCost.getValue() * InlineConstants::getInstrCost(); + ArgStackCost.getValue() * InlineConstants::getInstrCost(); adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) * - *ArgStackCost.getValue() * InlineConstants::getInstrCost(); + ArgStackCost.getValue() * InlineConstants::getInstrCost(); return adjustThreshold; } diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index eb487bdaa88b9..bf2a95b2d1ddc 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1096,7 +1096,7 @@ InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src, float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value(); float MisalignmentProb = 1.0 - AlignmentProb; return (MisalignmentProb * P9PipelineFlushEstimate) + - (AlignmentProb * *Cost.getValue()); + (AlignmentProb * Cost.getValue()); } // Usually we should not get to this point, but the following is an attempt to diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 11f2095ac9bce..dadae2e71d44c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2909,7 +2909,7 @@ InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { bool Log2CostModel = Subtarget.getVRGatherCostModel() == llvm::RISCVSubtarget::NLog2N; if (Log2CostModel && LMULCost.isValid()) { - unsigned Log = Log2_64(*LMULCost.getValue()); + unsigned Log = Log2_64(LMULCost.getValue()); if (Log > 0) return LMULCost * Log; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 2cea601fb1a02..73ebd87cd0a94 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -483,7 +483,7 @@ costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalVT.getVectorNumElements()); - unsigned E = *NumOfDests.getValue(); + unsigned E = NumOfDests.getValue(); unsigned NormalizedVF = LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 53270ac096bb0..ee142ccd20e20 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -380,7 +380,7 @@ void SystemZTTIImpl::getUnrollingPreferences( // The z13 processor will run out of store tags if too many stores // 
are fed into it too quickly. Therefore make sure there are not // too many stores in the resulting unrolled loop. - unsigned const NumStoresVal = *NumStores.getValue(); + unsigned const NumStoresVal = NumStores.getValue(); unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX); if (HasCall) { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 2375a8f992aa5..74bb25781b534 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1748,7 +1748,7 @@ InstructionCost X86TTIImpl::getShuffleCost( getTypeLegalizationCost( FixedVectorType::get(BaseTp->getElementType(), Mask.size())) .first; - unsigned E = *NumOfDests.getValue(); + unsigned E = NumOfDests.getValue(); unsigned NormalizedVF = LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); @@ -4931,7 +4931,7 @@ InstructionCost X86TTIImpl::getScalarizationOverhead( (LegalVectorBitWidth % LaneBitWidth) == 0) && "Illegal vector"); - const int NumLegalVectors = *LT.first.getValue(); + const int NumLegalVectors = LT.first.getValue(); assert(NumLegalVectors >= 0 && "Negative cost!"); // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much @@ -6164,7 +6164,7 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, std::pair IdxsLT = getTypeLegalizationCost(IndexVTy); std::pair SrcLT = getTypeLegalizationCost(SrcVTy); InstructionCost::CostType SplitFactor = - *std::max(IdxsLT.first, SrcLT.first).getValue(); + std::max(IdxsLT.first, SrcLT.first).getValue(); if (SplitFactor > 1) { // Handle splitting of vector of pointers auto *SplitSrcTy = diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index c13305ce5056d..1034ce9582152 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -662,7 +662,7 @@ FunctionSpecializer::~FunctionSpecializer() { /// non-negative, which is true for both TCK_CodeSize and TCK_Latency, and /// always Valid. static unsigned getCostValue(const Cost &C) { - int64_t Value = *C.getValue(); + int64_t Value = C.getValue(); assert(Value >= 0 && "CodeSize and Latency cannot be negative"); // It is safe to down cast since we know the arguments cannot be negative and @@ -713,7 +713,7 @@ bool FunctionSpecializer::run() { if (!SpecializeLiteralConstant && !Inserted && !Metrics.isRecursive) continue; - int64_t Sz = *Metrics.NumInsts.getValue(); + int64_t Sz = Metrics.NumInsts.getValue(); assert(Sz > 0 && "CodeSize should be positive"); // It is safe to down cast from int64_t, NumInsts is always positive. 
unsigned FuncSize = static_cast(Sz); diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index e2df95ed23c10..b79fe83b23ec6 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -1320,7 +1320,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { RelativeToEntryFreq = BranchProbability(0, 1); BlockFrequency WeightedRcost = - BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq; + BlockFrequency(NonWeightedRcost.getValue()) * RelativeToEntryFreq; // The call sequence(s) to the outlined function(s) are larger than the sum of // the original outlined region size(s), it does not increase the chances of diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 40c4c15b7120b..dd4d4efb7fecb 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -386,7 +386,7 @@ void ConstantHoistingPass::collectConstantCandidates( ConstIntCandVec.push_back(ConstantCandidate(ConstInt)); Itr->second = ConstIntCandVec.size() - 1; } - ConstIntCandVec[Itr->second].addUser(Inst, Idx, *Cost.getValue()); + ConstIntCandVec[Itr->second].addUser(Inst, Idx, Cost.getValue()); LLVM_DEBUG(if (isa(Inst->getOperand(Idx))) dbgs() << "Collect constant " << *ConstInt << " from " << *Inst << " with cost " << Cost << '\n'; @@ -446,7 +446,7 @@ void ConstantHoistingPass::collectConstantCandidates( ConstExpr)); Itr->second = ExprCandVec.size() - 1; } - ExprCandVec[Itr->second].addUser(Inst, Idx, *Cost.getValue()); + ExprCandVec[Itr->second].addUser(Inst, Idx, Cost.getValue()); } /// Check the operand for instruction Inst at index Idx. diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 838c7a1b7459d..61863bcf1337a 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -304,7 +304,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (!Metrics.NumInsts.isValid()) return MadeChange; - unsigned LoopSize = *Metrics.NumInsts.getValue(); + unsigned LoopSize = Metrics.NumInsts.getValue(); if (!LoopSize) LoopSize = 1; diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index fd16593b2e874..04719fb70552b 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1535,7 +1535,7 @@ void Cost::RateFormula(const Formula &F, C.NumBaseAdds += (F.UnfoldedOffset.isNonZero()); // Accumulate non-free scaling amounts. - C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue(); + C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue(); // Tally up the non-zero immediates. 
for (const LSRFixup &Fixup : LU.Fixups) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index be9b0e3244b65..d7080d6d76794 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -677,8 +677,8 @@ static std::optional analyzeLoopUnrollCost( LLVM_DEBUG(dbgs() << "Analysis finished:\n" << "UnrolledCost: " << UnrolledCost << ", " << "RolledDynamicCost: " << RolledDynamicCost << "\n"); - return {{unsigned(*UnrolledCost.getValue()), - unsigned(*RolledDynamicCost.getValue())}}; + return {{unsigned(UnrolledCost.getValue()), + unsigned(RolledDynamicCost.getValue())}}; } UnrollCostEstimator::UnrollCostEstimator( @@ -729,7 +729,7 @@ bool UnrollCostEstimator::canUnroll() const { uint64_t UnrollCostEstimator::getUnrolledLoopSize( const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite) const { - unsigned LS = *LoopSize.getValue(); + unsigned LS = LoopSize.getValue(); assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!"); if (CountOverwrite) return static_cast(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 32c3435ccb38d..f985e883d0dde 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2002,7 +2002,7 @@ class GeneratedRTChecks { InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; // Let's ensure the cost is always at least 1. - NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), + NewMemCheckCost = std::max(NewMemCheckCost.getValue(), (InstructionCost::CostType)1); if (BestTripCount > 1) @@ -5314,7 +5314,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor( - SmallLoopCost / *LoopCost.getValue())); + SmallLoopCost / LoopCost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. @@ -7659,7 +7659,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << " (Estimated cost per lane: "); if (Cost.isValid()) { - double CostPerLane = double(*Cost.getValue()) / EstimatedWidth; + double CostPerLane = double(Cost.getValue()) / EstimatedWidth; LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane)); } else /* No point dividing an invalid cost - it will still be invalid */ LLVM_DEBUG(dbgs() << "Invalid"); @@ -10478,7 +10478,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // The scalar cost should only be 0 when vectorizing with a user specified // VF/IC. In those cases, runtime checks should always be generated. - uint64_t ScalarC = *VF.ScalarCost.getValue(); + uint64_t ScalarC = VF.ScalarCost.getValue(); if (ScalarC == 0) return true; @@ -10513,8 +10513,8 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. 
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); - uint64_t RtC = *TotalCost.getValue(); - uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); + uint64_t RtC = TotalCost.getValue(); + uint64_t Div = ScalarC * IntVF - VF.Cost.getValue(); uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); // Second, compute a minimum iteration count so that the cost of the diff --git a/llvm/unittests/Support/InstructionCostTest.cpp b/llvm/unittests/Support/InstructionCostTest.cpp index 4e2842d8ada97..efe838897a684 100644 --- a/llvm/unittests/Support/InstructionCostTest.cpp +++ b/llvm/unittests/Support/InstructionCostTest.cpp @@ -23,7 +23,7 @@ TEST_F(CostTest, DefaultCtor) { InstructionCost DefaultCost; ASSERT_TRUE(DefaultCost.isValid()); - EXPECT_EQ(*(DefaultCost.getValue()), 0); + EXPECT_EQ(DefaultCost.getValue(), 0); } TEST_F(CostTest, Operators) { @@ -70,8 +70,7 @@ TEST_F(CostTest, Operators) { EXPECT_FALSE(TmpCost.isValid()); // Test value extraction - EXPECT_EQ(*(VThree.getValue()), 3); - EXPECT_EQ(IThreeA.getValue(), std::nullopt); + EXPECT_EQ(VThree.getValue(), 3); EXPECT_EQ(std::min(VThree, VNegTwo), -2); EXPECT_EQ(std::max(VThree, VSix), 6); From ca3a5d37ef64668234cbce7236dd640a98e2d687 Mon Sep 17 00:00:00 2001 From: jeremyd2019 Date: Tue, 22 Apr 2025 23:48:08 -0700 Subject: [PATCH 021/245] [Clang] [Driver] use __cxa_atexit by default on Cygwin. (#135701) GCC on Cygwin and MSYS2 are built with --enable-__cxa_atexit. Adjust test to expect this change. --- clang/lib/Driver/ToolChains/Clang.cpp | 4 +++- clang/test/Driver/cxa-atexit.cpp | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f2f5231933c88..f98cd389509cd 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7232,7 +7232,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // -fuse-cxa-atexit is default. 
if (!Args.hasFlag( options::OPT_fuse_cxa_atexit, options::OPT_fno_use_cxa_atexit, - !RawTriple.isOSAIX() && !RawTriple.isOSWindows() && + !RawTriple.isOSAIX() && + (!RawTriple.isOSWindows() || + RawTriple.isWindowsCygwinEnvironment()) && ((RawTriple.getVendor() != llvm::Triple::MipsTechnologies) || RawTriple.hasEnvironment())) || KernelOrKext) diff --git a/clang/test/Driver/cxa-atexit.cpp b/clang/test/Driver/cxa-atexit.cpp index 537a11a35f51b..8bfb938e8e8e7 100644 --- a/clang/test/Driver/cxa-atexit.cpp +++ b/clang/test/Driver/cxa-atexit.cpp @@ -1,15 +1,15 @@ // RUN: %clang -### -target armv7-unknown-windows-msvc -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS // RUN: %clang -### -target armv7-unknown-windows-itanium -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS // RUN: %clang -### -target armv7-unknown-windows-gnu -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS -// RUN: %clang -### -target armv7-unknown-windows-cygnus -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS +// RUN: %clang -### -target armv7-unknown-windows-cygnus -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-CYGWIN // RUN: %clang -### -target i686-unknown-windows-msvc -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS // RUN: %clang -### -target i686-unknown-windows-itanium -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS // RUN: %clang -### -target i686-unknown-windows-gnu -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS -// RUN: %clang -### -target i686-unknown-windows-cygnus -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS +// RUN: %clang -### -target i686-unknown-windows-cygnus -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-CYGWIN // RUN: %clang -### -target x86_64-unknown-windows-msvc -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS // RUN: %clang -### -target x86_64-unknown-windows-itanium -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS // RUN: %clang -### -target x86_64-unknown-windows-gnu -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS -// RUN: %clang -### -target x86_64-unknown-windows-cygnus -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-WINDOWS +// RUN: %clang -### -target x86_64-unknown-windows-cygnus -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-CYGWIN // RUN: %clang -### -target hexagon-unknown-none -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-HEXAGON // RUN: %clang -### -target xcore-unknown-none -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-XCORE // RUN: %clang -### -target armv7-mti-none -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-MTI @@ -21,6 +21,7 @@ // RUN: %clang -### -target powerpc64-ibm-aix-xcoff -c %s -o /dev/null 2>&1 | FileCheck %s -check-prefix CHECK-AIX // CHECK-WINDOWS: "-fno-use-cxa-atexit" +// CHECK-CYGWIN-NOT: "-fno-use-cxa-atexit" // CHECK-SOLARIS-NOT: "-fno-use-cxa-atexit" // CHECK-HEXAGON-NOT: "-fno-use-cxa-atexit" // CHECK-XCORE: "-fno-use-cxa-atexit" From 1a99f7981f16461dc8e9add411abd1218435320e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 23 Apr 2025 14:28:53 +0800 Subject: [PATCH 022/245] [RISCV] Add tests for fixed-length vwadd[u].{w,v}v with disjoint or. 
NFC --- .../CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index 9997646dce1a1..5e7d1b91d7892 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -876,3 +876,107 @@ define <2 x i64> @vwadd_v2i64_of_v2i16(ptr %x, ptr %y) { %e = add <2 x i64> %c, %d ret <2 x i64> %e } + +; %x.i32 and %y.i32 are disjoint, so DAGCombiner will combine it into an or. +define <4 x i32> @vwaddu_vv_disjoint_or_add(<4 x i8> %x.i8, <4 x i8> %y.i8) { +; CHECK-LABEL: vwaddu_vv_disjoint_or_add: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsll.vi v8, v10, 8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf4 v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret + %x.i16 = zext <4 x i8> %x.i8 to <4 x i16> + %x.shl = shl <4 x i16> %x.i16, splat (i16 8) + %x.i32 = zext <4 x i16> %x.shl to <4 x i32> + %y.i32 = zext <4 x i8> %y.i8 to <4 x i32> + %add = add <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %add +} + +define <4 x i32> @vwaddu_vv_disjoint_or(<4 x i16> %x.i16, <4 x i16> %y.i16) { +; CHECK-LABEL: vwaddu_vv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vor.vv v9, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: ret + %x.i32 = zext <4 x i16> %x.i16 to <4 x i32> + %y.i32 = zext <4 x i16> %y.i16 to <4 x i32> + %or = or disjoint <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %or +} + +define <4 x i32> @vwadd_vv_disjoint_or(<4 x i16> %x.i16, <4 x i16> %y.i16) { +; CHECK-LABEL: vwadd_vv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vor.vv v9, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: ret + %x.i32 = sext <4 x i16> %x.i16 to <4 x i32> + %y.i32 = sext <4 x i16> %y.i16 to <4 x i32> + %or = or disjoint <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %or +} + +define <4 x i32> @vwaddu_vx_disjoint_or(<4 x i16> %x.i16, i16 %y.i16) { +; CHECK-LABEL: vwaddu_vx_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vor.vx v9, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: ret + %x.i32 = zext <4 x i16> %x.i16 to <4 x i32> + %y.head = insertelement <4 x i16> poison, i16 %y.i16, i32 0 + %y.splat = shufflevector <4 x i16> %y.head, <4 x i16> poison, <4 x i32> zeroinitializer + %y.i32 = zext <4 x i16> %y.splat to <4 x i32> + %or = or disjoint <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %or +} + +define <4 x i32> @vwadd_vx_disjoint_or(<4 x i16> %x.i16, i16 %y.i16) { +; CHECK-LABEL: vwadd_vx_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vor.vx v9, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: ret + %x.i32 = sext <4 x i16> %x.i16 to <4 x i32> + %y.head = insertelement <4 x i16> poison, i16 %y.i16, i32 0 + %y.splat = shufflevector <4 x i16> %y.head, <4 x i16> poison, <4 x i32> zeroinitializer + %y.i32 = sext <4 x i16> %y.splat to <4 x i32> + %or = or disjoint <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %or +} + +define <4 x i32> @vwaddu_wv_disjoint_or(<4 x i32> %x.i32, <4 x 
i16> %y.i16) { +; CHECK-LABEL: vwaddu_wv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: ret + %y.i32 = zext <4 x i16> %y.i16 to <4 x i32> + %or = or disjoint <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %or +} + +define <4 x i32> @vwadd_wv_disjoint_or(<4 x i32> %x.i32, <4 x i16> %y.i16) { +; CHECK-LABEL: vwadd_wv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: ret + %y.i32 = sext <4 x i16> %y.i16 to <4 x i32> + %or = or disjoint <4 x i32> %x.i32, %y.i32 + ret <4 x i32> %or +} From da8f2d52423bb82b5d4e75cff3018704effe044f Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 22 Apr 2025 23:45:58 -0700 Subject: [PATCH 023/245] Revert "[clang-format] Allow breaking before kw___attribute (#128623)" This reverts commit 8fc8a84e23471fe56214e68706addc712b5a2949, which caused a regression. Fixes #136675. --- clang/lib/Format/TokenAnnotator.cpp | 5 +++-- clang/unittests/Format/FormatTest.cpp | 3 --- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 6d861d19117e2..3e17c688dbcce 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -6242,8 +6242,6 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, TT_ClassHeadName, tok::kw_operator)) { return true; } - if (Right.isAttribute()) - return true; if (Left.is(TT_PointerOrReference)) return false; if (Right.isTrailingComment()) { @@ -6388,6 +6386,9 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, tok::less, tok::coloncolon); } + if (Right.isAttribute()) + return true; + if (Right.is(tok::l_square) && Right.is(TT_AttributeSquare)) return Left.isNot(TT_AttributeSquare); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index f1b3b7dd8c0c3..8543c1b565d6d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -12645,9 +12645,6 @@ TEST_F(FormatTest, UnderstandsAttributes) { verifyFormat("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa __attribute__((unused))\n" "aaaaaaaaaaaaaaaaaaaaaaa(int i);"); verifyFormat("__attribute__((nodebug)) ::qualified_type f();"); - verifyFormat( - "RenderWidgetHostViewCocoa *\n" - " __attribute__((objc_precise_lifetime)) keepSelfAlive = self;"); FormatStyle AfterType = getLLVMStyle(); AfterType.BreakAfterReturnType = FormatStyle::RTBS_All; verifyFormat("__attribute__((nodebug)) void\n" From dfc60b2ceb50e75dc07bdda18ae74695f18b370c Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 23 Apr 2025 09:01:45 +0200 Subject: [PATCH 024/245] [mlir][bazel] Also add SideEffectInterfaces dep to PtrDialect. Fix for port of e112dcc. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 8a85c6fffd628..f6f59aa213874 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6365,6 +6365,7 @@ cc_library( ":PtrMemorySpaceInterfacesIncGen", ":PtrOpsEnumsGen", ":PtrOpsIncGen", + ":SideEffectInterfaces", ":ViewLikeInterface", "//llvm:Support", ], From 82049310385d5222527cf7d12984bd8d4f955dd1 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 23 Apr 2025 15:17:04 +0800 Subject: [PATCH 025/245] [RISCV] Add disjoint or patterns for vwadd[u].v{v,x} (#136716) DAGCombiner::hoistLogicOpWithSameOpcodeHands will hoist (or disjoint (ext a), (ext b)) -> (ext (or disjoint a, b)) So this adds patterns to match vwadd[u].v{v,x} in this case. We have to teach the combine to preserve the disjoint flag. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 23 ++++++++++ llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll | 44 ++++++++++++++----- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b571f635c744f..6255922979399 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6037,7 +6037,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { LegalTypes && !TLI.isTypeDesirableForOp(LogicOpcode, XVT)) return SDValue(); // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y) - SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); + SDNodeFlags LogicFlags; + LogicFlags.setDisjoint(N->getFlags().hasDisjoint() && + ISD::isExtOpcode(HandOpcode)); + SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y, LogicFlags); if (HandOpcode == ISD::SIGN_EXTEND_INREG) return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1)); return DAG.getNode(HandOpcode, DL, VT, Logic); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index b2c5261ae6c2d..aea125c5348dd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -912,6 +912,29 @@ defm : VPatWidenBinarySDNode_VV_VX_WV_WX; defm : VPatWidenBinarySDNode_VV_VX_WV_WX; defm : VPatWidenBinarySDNode_VV_VX_WV_WX; +// DAGCombiner::hoistLogicOpWithSameOpcodeHands may hoist disjoint ors +// to (ext (or disjoint (a, b))) +multiclass VPatWidenOrDisjoint_VV_VX { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates.Predicates, + GetVTypePredicates.Predicates) in { + def : Pat<(wti.Vector (extop (vti.Vector (or_is_add vti.RegClass:$rs2, vti.RegClass:$rs1)))), + (!cast(instruction_name#"_VV_"#vti.LMul.MX) + (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, + vti.RegClass:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>; + def : Pat<(wti.Vector (extop (vti.Vector (or_is_add vti.RegClass:$rs2, (SplatPat (XLenVT GPR:$rs1)))))), + (!cast(instruction_name#"_VX_"#vti.LMul.MX) + (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, + GPR:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>; + } + } +} +defm : VPatWidenOrDisjoint_VV_VX; +defm : VPatWidenOrDisjoint_VV_VX; +defm : VPatWidenOrDisjoint_VV_VX; + defm : VPatWidenBinarySDNode_VV_VX_WV_WX; defm : 
VPatWidenBinarySDNode_VV_VX_WV_WX; defm : VPatWidenBinarySDNode_VV_VX_WV_WX; diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index 3f5d42f89337b..f94e46771f49c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1417,15 +1417,12 @@ define @vwaddu_vv_disjoint_or_add( %x.i8, %add } -; TODO: We could select vwaddu.vv, but when both arms of the or are the same -; DAGCombiner::hoistLogicOpWithSameOpcodeHands moves the zext above the or. define @vwaddu_vv_disjoint_or( %x.i16, %y.i16) { ; CHECK-LABEL: vwaddu_vv_disjoint_or: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vor.vv v9, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x.i32 = zext %x.i16 to %y.i32 = zext %y.i16 to @@ -1433,15 +1430,12 @@ define @vwaddu_vv_disjoint_or( %x.i16, %or } -; TODO: We could select vwadd.vv, but when both arms of the or are the same -; DAGCombiner::hoistLogicOpWithSameOpcodeHands moves the zext above the or. define @vwadd_vv_disjoint_or( %x.i16, %y.i16) { ; CHECK-LABEL: vwadd_vv_disjoint_or: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vor.vv v9, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vwadd.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x.i32 = sext %x.i16 to %y.i32 = sext %y.i16 to @@ -1449,6 +1443,36 @@ define @vwadd_vv_disjoint_or( %x.i16, %or } +define @vwaddu_vx_disjoint_or( %x.i16, i16 %y.i16) { +; CHECK-LABEL: vwaddu_vx_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vwaddu.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %x.i32 = zext %x.i16 to + %y.head = insertelement poison, i16 %y.i16, i32 0 + %y.splat = shufflevector %y.head, poison, zeroinitializer + %y.i32 = zext %y.splat to + %or = or disjoint %x.i32, %y.i32 + ret %or +} + +define @vwadd_vx_disjoint_or( %x.i16, i16 %y.i16) { +; CHECK-LABEL: vwadd_vx_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vwadd.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %x.i32 = sext %x.i16 to + %y.head = insertelement poison, i16 %y.i16, i32 0 + %y.splat = shufflevector %y.head, poison, zeroinitializer + %y.i32 = sext %y.splat to + %or = or disjoint %x.i32, %y.i32 + ret %or +} + define @vwaddu_wv_disjoint_or( %x.i32, %y.i16) { ; CHECK-LABEL: vwaddu_wv_disjoint_or: ; CHECK: # %bb.0: From dd3de590ebd63566a1a54eb0e2140c433a9add84 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 23 Apr 2025 08:20:12 +0100 Subject: [PATCH 026/245] [CostModel] Fix InlineSizeEstimatorAnalysis after #135596 Fix a reference to getValue() being optional in InlineSizeEstimatorAnalysis, a file that is not included in the default build. A "warning: enumerated and non-enumerated type in conditional expression" warning is fixed in AMDGPU too. 
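For illustration only (not part of this change): a minimal sketch, assuming only
that after #135596 InstructionCost::getValue() returns the underlying cost type
directly instead of a std::optional, which is why the callers updated below drop
the leading dereference but keep the isValid() guard where the cost may be
invalid. The helper name and fallback parameter here are made up for the sketch.

  #include "llvm/Support/InstructionCost.h"
  using namespace llvm;

  // Read a cost, falling back to a caller-chosen value when it is invalid.
  static int64_t costOrDefault(InstructionCost Cost, int64_t Fallback) {
    // Before #135596: return Cost.isValid() ? *Cost.getValue() : Fallback;
    return Cost.isValid() ? Cost.getValue() : Fallback;
  }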
--- llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp | 5 +++-- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp index fcecfc795b571..fc635726a6aa4 100644 --- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp +++ b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp @@ -128,8 +128,9 @@ size_t getSize(Function &F, TargetTransformInfo &TTI) { size_t Ret = 0; for (const auto &BB : F) for (const auto &I : BB) - Ret += *(TTI.getInstructionCost( - &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize).getValue()); + Ret += TTI.getInstructionCost( + &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize) + .getValue(); return Ret; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 1506f02793ba4..ed9a4d9888dc4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -205,8 +205,9 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M, TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); assert(Cost != InstructionCost::getMax()); // Assume expensive if we can't tell the cost of an instruction. - CostType CostVal = Cost.isValid() ? Cost.getValue() - : TargetTransformInfo::TCC_Expensive; + CostType CostVal = Cost.isValid() + ? Cost.getValue() + : (CostType)TargetTransformInfo::TCC_Expensive; assert((FnCost + CostVal) >= FnCost && "Overflow!"); FnCost += CostVal; } From ae47f2533709058d3442a34af783d8cd721b4177 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Wed, 23 Apr 2025 09:34:54 +0200 Subject: [PATCH 027/245] [docs] Fix the use of word "dependent" and other typos in the C++ Modules Doc (#136719) "Dependant BMI" / "Dependent BMI" was used incorrectly in the documentation: "Dependent BMI" refers to a BMI that depends on the current TU, but it was used for the BMI that current TU depends on. I replaced all the mentions with "BMI dependency". --- clang/docs/StandardCPlusPlusModules.rst | 71 ++++++++++++------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 93edce0cf90b7..2ca014f3fd831 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -305,17 +305,17 @@ Therefore, none of the following names are valid by default: Using a reserved module name is strongly discouraged, but ``-Wno-reserved-module-identifier`` can be used to suppress the warning. -Specifying dependent BMIs -~~~~~~~~~~~~~~~~~~~~~~~~~ +Specifying BMI dependencies +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There are 3 ways to specify a dependent BMI: +There are 3 ways to specify a BMI dependency: 1. ``-fprebuilt-module-path=``. 2. ``-fmodule-file=`` (Deprecated). 3. ``-fmodule-file==``. The ``-fprebuilt-module-path`` option specifies the path to search for -dependent BMIs. Multiple paths may be specified, similar to using ``-I`` to +BMI dependencies. Multiple paths may be specified, similar to using ``-I`` to specify a search path for header files. When importing a module ``M``, the compiler looks for ``M.pcm`` in the directories specified by ``-fprebuilt-module-path``. 
Similarly, when importing a partition module unit @@ -337,9 +337,8 @@ When these options are specified in the same invocation of the compiler, the ``-fmodule-file==``, which takes precedence over ``-fprebuilt-module-path=``. -Note: all dependant BMIs must be specified explicitly, either directly or -indirectly dependent BMIs explicitly. See -https://github.com/llvm/llvm-project/issues/62707 for details. +Note: all BMI dependencies must be specified explicitly, either directly or +indirectly. See https://github.com/llvm/llvm-project/issues/62707 for details. When compiling a ``module implementation unit``, the BMI of the corresponding ``primary module interface unit`` must be specified because a module @@ -380,7 +379,7 @@ For example, the traditional compilation processes for headers are like: hdr2.h --, | src2.cpp -+> clang++ src2.cpp --> src2.o ---' -And the compilation process for module units are like: +And the compilation processes for module units are like: .. code-block:: text @@ -435,7 +434,7 @@ non-module-unit uses need to be consistent. Consider the following example: $ clang++ -std=c++23 Use.cpp -fprebuilt-module-path=. Clang rejects the example due to the inconsistent language standard modes. Not -all compiler options are language dialect options, though. For example: +all compiler options are language-dialect options, though. For example: .. code-block:: console @@ -465,7 +464,7 @@ translation units. Source Files Consistency ^^^^^^^^^^^^^^^^^^^^^^^^ -Clang may open the input files\ :sup:`1`` of a BMI during the compilation. This implies that +Clang may open the input files [1]_ of a BMI during the compilation. This implies that when Clang consumes a BMI, all the input files need to be present in the original path and with the original contents. @@ -477,21 +476,21 @@ When the ``-fmodules-embed-all-files`` flag are enabled, Clang explicitly emits code into the BMI file, the contents of the BMI file contain a sufficiently verbose representation to reproduce the original source file. -:sup:`1`` Input files: The source files which took part in the compilation of the BMI. -For example: +.. [1] Input files: The source files which took part in the compilation of the BMI. + For example: -.. code-block:: c++ + .. code-block:: c++ - // M.cppm - module; - #include "foo.h" - export module M; + // M.cppm + module; + #include "foo.h" + export module M; - // foo.h - #pragma once - #include "bar.h" + // foo.h + #pragma once + #include "bar.h" -The ``M.cppm``, ``foo.h`` and ``bar.h`` are input files for the BMI of ``M.cppm``. + The ``M.cppm``, ``foo.h`` and ``bar.h`` are input files for the BMI of ``M.cppm``. Object definition consistency ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -781,8 +780,8 @@ for the BMI being produced. This ensures that build systems are not required to transitively imported modules when deciding whether to recompile. What is considered to be a potential contributory BMIs is currently unspecified. -However, it is a severe bug for a BMI to remain unchanged following an observable change -that affects its consumers. +However, it is a severe bug for a BMI to remain unchanged following an +observable change in the module source files that affects the module consumers. Build systems may utilize this optimization by doing an update-if-changed operation to the BMI that is consumed from the BMI that is output by the compiler. @@ -1192,14 +1191,14 @@ them to ``your_library_imported.h`` too. 
Importing modules ~~~~~~~~~~~~~~~~~ -When there are dependent libraries providing modules, they should be imported -in your module as well. Many existing libraries will fall into this category -once the ``std`` module is more widely available. +When there are library dependencies providing modules, the module dependencies +should be imported in your module as well. Many existing libraries will fall +into this category once the ``std`` module is more widely available. -All dependent libraries providing modules -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +All library dependencies providing modules +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Of course, most of the complexity disappears if all the dependent libraries +Of course, most of the complexity disappears if all the library dependencies provide modules. Headers need to be converted to include third-party headers conditionally. Then, @@ -1260,8 +1259,8 @@ Non-exported ``using`` declarations are unnecessary if using implementation module units. Instead, third-party modules can be imported directly in implementation module units. -Partial dependent libraries providing modules -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Partial library dependencies providing modules +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If the library has to mix the use of ``include`` and ``import`` in its module, the primary goal is still the removal of duplicated declarations in translation @@ -1562,17 +1561,17 @@ file as a header. For example: $ clang++ -std=c++20 -fmodule-header=system -xc++-header iostream -o iostream.pcm $ clang++ -std=c++20 -fmodule-file=iostream.pcm use.cpp -How to specify dependent BMIs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How to specify BMI dependencies +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``-fmodule-file`` can be used to specify a dependent BMI (or multiple times for -more than one dependent BMI). +``-fmodule-file`` can be used to specify a BMI dependency (or multiple times for +more than one BMI dependency). With the existing implementation, ``-fprebuilt-module-path`` cannot be used for header units (because they are nominally anonymous). For header units, use ``-fmodule-file`` to include the relevant PCM file for each header unit. -This is expect to be solved in a future version of Clang either by the compiler +This is expected to be solved in a future version of Clang either by the compiler finding and specifying ``-fmodule-file`` automatically, or by the use of a module-mapper that understands how to map the header name to their PCMs. From d0cd6f3b9339326af01549ee09f17a6e9b54f505 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 23 Apr 2025 08:36:08 +0100 Subject: [PATCH 028/245] [AArch64] Fix tryToConvertShuffleOfTbl2ToTbl4 with non-buildvector input operands. (#135961) It looks like this code is only considering buildvector inputs, expecting the inputs to have at least 16 operands. This adds a check to make sure that is true. 
Fixes #135950 --- .../Target/AArch64/AArch64ISelLowering.cpp | 24 ++++++++------- llvm/test/CodeGen/AArch64/arm64-tbl.ll | 30 +++++++++++++++++++ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0ff681c8dbac6..cb8f324b61187 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13872,25 +13872,27 @@ static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64); EVT VT = Op.getValueType(); - if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN || - Tbl1->getOperand(0) != Tbl2ID || - Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN || - Tbl2->getOperand(0) != Tbl2ID) + if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN || + Tbl1.getOperand(0) != Tbl2ID || + Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN || + Tbl2.getOperand(0) != Tbl2ID) return SDValue(); - if (Tbl1->getValueType(0) != MVT::v16i8 || - Tbl2->getValueType(0) != MVT::v16i8) + if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8) + return SDValue(); + + SDValue Mask1 = Tbl1.getOperand(3); + SDValue Mask2 = Tbl2.getOperand(3); + if (Mask1.getOpcode() != ISD::BUILD_VECTOR || + Mask2.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); - SDValue Mask1 = Tbl1->getOperand(3); - SDValue Mask2 = Tbl2->getOperand(3); SmallVector TBLMaskParts(16, SDValue()); for (unsigned I = 0; I < 16; I++) { if (ShuffleMask[I] < 16) - TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]); + TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]); else { - auto *C = - dyn_cast(Mask2->getOperand(ShuffleMask[I] - 16)); + auto *C = dyn_cast(Mask2.getOperand(ShuffleMask[I] - 16)); if (!C) return SDValue(); TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32); diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll index a854cb7fec991..fe5a6f12a49c3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -1254,6 +1254,36 @@ define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> % ret <16 x i8> %tmp3 } +define <16 x i8> @pr135950(<16 x i8> %A, <16 x i8> %B, <16 x i8> %M) { +; CHECK-SD-LABEL: pr135950: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov.16b v3, v1 +; CHECK-SD-NEXT: movi.2d v1, #0000000000000000 +; CHECK-SD-NEXT: mov.16b v4, v0 +; CHECK-SD-NEXT: mov.16b v5, v3 +; CHECK-SD-NEXT: tbl.16b v1, { v3, v4 }, v1 +; CHECK-SD-NEXT: tbl.16b v0, { v4, v5 }, v2 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr135950: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: mov.16b v3, v2 +; CHECK-GI-NEXT: movi.2d v4, #0000000000000000 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v3 +; CHECK-GI-NEXT: mov.16b v2, v0 +; CHECK-GI-NEXT: tbl.16b v0, { v1, v2 }, v4 +; CHECK-GI-NEXT: zip1.16b v0, v3, v0 +; CHECK-GI-NEXT: ret + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %M) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %B, <16 x i8> %A, <16 x i8> zeroinitializer) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone declare <16 x i8> 
@llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
 declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone

From 91edbe223177504cf878340f37a36dfcee349cab Mon Sep 17 00:00:00 2001
From: wanglei
Date: Wed, 23 Apr 2025 16:15:29 +0800
Subject: [PATCH 029/245] [lldb][LoongArch] Fix expression function call
 failure

After upgrading the default code model from small to medium on LoongArch,
function calls made with the `expression` command may fail. This is because the
function call instruction has changed from `bl` to `pcalau18i + jirl`, but
`RuntimeDyld` does not handle out-of-range jumps for this instruction sequence.

This patch fixes #136561.

Reviewed By: SixWeining

Pull Request: https://github.com/llvm/llvm-project/pull/136563
---
 .../RuntimeDyld/RuntimeDyldELF.cpp            | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index f032d5cb30f23..cca99591c8c45 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -662,7 +662,18 @@ bool RuntimeDyldELF::resolveLoongArch64ShortBranch(
   }
   uint64_t Offset = RelI->getOffset();
   uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset);
-  if (!isInt<28>(Address + Value.Addend - SourceAddress))
+  uint64_t Delta = Address + Value.Addend - SourceAddress;
+  // Normal call
+  if (RelI->getType() == ELF::R_LARCH_B26) {
+    if (!isInt<28>(Delta))
+      return false;
+    resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(),
+                      Value.Addend);
+    return true;
+  }
+  // Medium call: R_LARCH_CALL36
+  // Range: [-128G - 0x20000, +128G - 0x20000)
+  if (((int64_t)Delta + 0x20000) != llvm::SignExtend64(Delta + 0x20000, 38))
     return false;
   resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(),
                     Value.Addend);
@@ -1743,7 +1754,8 @@ RuntimeDyldELF::processRelocationRef(
       processSimpleRelocation(SectionID, Offset, RelType, Value);
     }
   } else if (Arch == Triple::loongarch64) {
-    if (RelType == ELF::R_LARCH_B26 && MemMgr.allowStubAllocation()) {
+    if ((RelType == ELF::R_LARCH_B26 || RelType == ELF::R_LARCH_CALL36) &&
+        MemMgr.allowStubAllocation()) {
       resolveLoongArch64Branch(SectionID, Value, RelI, Stubs);
     } else if (RelType == ELF::R_LARCH_GOT_PC_HI20 ||
                RelType == ELF::R_LARCH_GOT_PC_LO12) {

From 8a57df6a5210d0c54ed482eb7230b7689a1f9cb9 Mon Sep 17 00:00:00 2001
From: Allin Lee <60502081+AllinLeeYL@users.noreply.github.com>
Date: Wed, 23 Apr 2025 16:29:06 +0800
Subject: [PATCH 030/245] [llvm-extract] support unnamed bbs. (#135140)

Dear developer:

I have recently been working with LLVM IR and wanted to isolate basic blocks
using the "llvm-extract" command. However, I found that the option
"llvm-extract --bb func_name:bb_name" only works when the source is compiled
to IR with "-fno-discard-value-names". In other words, the "llvm-extract"
command does not support unnamed basic blocks, which are the compiler's
default output. So, I made these changes and hope they will make LLVM better.
Best regards, Co-authored-by: Yilin Li --- llvm/include/llvm/IR/Value.h | 2 -- llvm/lib/IR/Value.cpp | 2 -- .../tools/llvm-extract/extract-unnamed-bb.ll | 28 +++++++++++++++++++ llvm/tools/llvm-extract/llvm-extract.cpp | 7 +++-- 4 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 llvm/test/tools/llvm-extract/extract-unnamed-bb.ll diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h index cfed12e2f5f8d..bf1de7eef9932 100644 --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -290,9 +290,7 @@ class Value { /// \note It is an error to call V->takeName(V). void takeName(Value *V); -#ifndef NDEBUG std::string getNameOrAsOperand() const; -#endif /// Change all uses of this to point to a new Value. /// diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 0eaf75b7bfaec..aa97b70f21aeb 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -441,7 +441,6 @@ void Value::takeName(Value *V) { ST->reinsertValue(this); } -#ifndef NDEBUG std::string Value::getNameOrAsOperand() const { if (!getName().empty()) return std::string(getName()); @@ -451,7 +450,6 @@ std::string Value::getNameOrAsOperand() const { printAsOperand(OS, false); return OS.str(); } -#endif void Value::assertModuleIsMaterializedImpl() const { #ifndef NDEBUG diff --git a/llvm/test/tools/llvm-extract/extract-unnamed-bb.ll b/llvm/test/tools/llvm-extract/extract-unnamed-bb.ll new file mode 100644 index 0000000000000..bb82cfdee57d9 --- /dev/null +++ b/llvm/test/tools/llvm-extract/extract-unnamed-bb.ll @@ -0,0 +1,28 @@ +; RUN: llvm-extract -S --bb "_Z6kernelv.extracted:%5" < %s | FileCheck %s + +; CHECK: define dso_local void @_Z6kernelv.extracted.extracted(i64 %0, i64 %1) { + +; CHECK 2: +; CHECK: %3 = add nuw nsw i64 %0, 1 +; CHECK-NEXT: %4 = sub nuw nsw i64 %3, %1 +; CHECK-NEXT: br label %.exitStub + +define dso_local void @_Z6kernelv.extracted(i64 %0, ptr %.out) #0 { +newFuncRoot: + br label %1 + +1: + %2 = phi i64 [ 0, %newFuncRoot ], [ %3, %1 ] + %3 = add nuw nsw i64 %2, 1 + %4 = icmp eq i64 %2, %3 + br i1 %4, label %5, label %1 + +5: + %6 = add nuw nsw i64 %0, 1 + %7 = sub nuw nsw i64 %6, %3 + br label %8 + +8: + %9 = add nuw i64 %0, 2 + ret void +} diff --git a/llvm/tools/llvm-extract/llvm-extract.cpp b/llvm/tools/llvm-extract/llvm-extract.cpp index 648060acb392c..69636ca018dcb 100644 --- a/llvm/tools/llvm-extract/llvm-extract.cpp +++ b/llvm/tools/llvm-extract/llvm-extract.cpp @@ -90,10 +90,13 @@ static cl::list ExtractBlocks( "Each pair will create a function.\n" "If multiple basic blocks are specified in one pair,\n" "the first block in the sequence should dominate the rest.\n" + "If an unnamed basic block is to be extracted,\n" + "'%' should be added before the basic block variable names.\n" "eg:\n" " --bb=f:bb1;bb2 will extract one function with both bb1 and bb2;\n" " --bb=f:bb1 --bb=f:bb2 will extract two functions, one with bb1, one " - "with bb2."), + "with bb2.\n" + " --bb=f:%1 will extract one function with basic block 1;"), cl::value_desc("function:bb1[;bb2...]"), cl::cat(ExtractCat)); // ExtractAlias - The alias to extract from the module. @@ -356,7 +359,7 @@ int main(int argc, char **argv) { // The function has been materialized, so add its matching basic blocks // to the block extractor list, or fail if a name is not found. 
auto Res = llvm::find_if(*P.first, [&](const BasicBlock &BB) { - return BB.getName() == BBName; + return BB.getNameOrAsOperand() == BBName; }); if (Res == P.first->end()) { errs() << argv[0] << ": function " << P.first->getName() From 6db447f824d46956172b104f08105b25f9428f55 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Wed, 23 Apr 2025 16:31:50 +0800 Subject: [PATCH 031/245] [InstCombine] Canonicalize `max(min(X, MinC), MaxC) -> min(max(X, MaxC), MinC)` (#136665) Closes #121870. https://alive2.llvm.org/ce/z/WjmAjz https://alive2.llvm.org/ce/z/4KCjgL --- .../InstCombine/InstCombineCalls.cpp | 23 ++ .../Transforms/InstCombine/clamp-to-minmax.ll | 28 +- .../InstCombine/max-min-canonicalize.ll | 367 ++++++++++++++++++ .../Transforms/InstCombine/max_known_bits.ll | 24 +- .../Transforms/InstCombine/minmax-fold.ll | 34 +- .../InstCombine/minmax-intrinsics.ll | 14 +- llvm/test/Transforms/InstCombine/sadd_sat.ll | 40 +- .../Transforms/InstCombine/select-min-max.ll | 8 +- 8 files changed, 464 insertions(+), 74 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/max-min-canonicalize.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 38519d81fce8d..844e18dd7d8c5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1924,6 +1924,29 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } + // smax(smin(X, MinC), MaxC) -> smin(smax(X, MaxC), MinC) if MinC s>= MaxC + // umax(umin(X, MinC), MaxC) -> umin(umax(X, MaxC), MinC) if MinC u>= MaxC + const APInt *MinC, *MaxC; + auto CreateCanonicalClampForm = [&](bool IsSigned) { + auto MaxIID = IsSigned ? Intrinsic::smax : Intrinsic::umax; + auto MinIID = IsSigned ? Intrinsic::smin : Intrinsic::umin; + Value *NewMax = Builder.CreateBinaryIntrinsic( + MaxIID, X, ConstantInt::get(X->getType(), *MaxC)); + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + MinIID, NewMax, ConstantInt::get(X->getType(), *MinC))); + }; + if (IID == Intrinsic::smax && + match(I0, m_OneUse(m_Intrinsic(m_Value(X), + m_APInt(MinC)))) && + match(I1, m_APInt(MaxC)) && MinC->sgt(*MaxC)) + return CreateCanonicalClampForm(true); + if (IID == Intrinsic::umax && + match(I0, m_OneUse(m_Intrinsic(m_Value(X), + m_APInt(MinC)))) && + match(I1, m_APInt(MaxC)) && MinC->ugt(*MaxC)) + return CreateCanonicalClampForm(false); + // umin(i1 X, i1 Y) -> and i1 X, Y // smax(i1 X, i1 Y) -> and i1 X, Y if ((IID == Intrinsic::umin || IID == Intrinsic::smax) && diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 478d437847127..b557c0dbe2629 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -83,7 +83,7 @@ define float @clamp_float_fast_unordered_strict_maxmin(float %x) { ; (X <= C1) ? 
C1 : MIN(X, C2) define float @clamp_float_fast_unordered_nonstrict_maxmin(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_nonstrict_maxmin( -; CHECK-NEXT: [[MIN:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[MIN:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ule float [[X]], 1.000000e+00 ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 1.000000e+00, float [[MIN]] ; CHECK-NEXT: ret float [[R]] @@ -98,7 +98,7 @@ define float @clamp_float_fast_unordered_nonstrict_maxmin(float %x) { ; (X > C1) ? C1 : MAX(X, C2) define float @clamp_float_fast_unordered_strict_minmax(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_strict_minmax( -; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) ; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ugt float [[X]], 2.550000e+02 ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 2.550000e+02, float [[MAX]] ; CHECK-NEXT: ret float [[R]] @@ -113,7 +113,7 @@ define float @clamp_float_fast_unordered_strict_minmax(float %x) { ; (X >= C1) ? C1 : MAX(X, C2) define float @clamp_float_fast_unordered_nonstrict_minmax(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_nonstrict_minmax( -; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) ; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast uge float [[X]], 2.550000e+02 ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 2.550000e+02, float [[MAX]] ; CHECK-NEXT: ret float [[R]] @@ -147,7 +147,7 @@ define float @clamp_test_1(float %x) { ; Like @clamp_test_1 but HighConst < LowConst define float @clamp_negative_wrong_const(float %x) { ; CHECK-LABEL: @clamp_negative_wrong_const( -; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ugt float [[X]], 5.120000e+02 ; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 5.120000e+02 ; CHECK-NEXT: ret float [[R]] @@ -162,7 +162,7 @@ define float @clamp_negative_wrong_const(float %x) { ; Like @clamp_test_1 but both are min define float @clamp_negative_same_op(float %x) { ; CHECK-LABEL: @clamp_negative_same_op( -; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 ; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] @@ -500,9 +500,9 @@ define float @ui64_clamp_and_cast_to_float(i64 %x) { define float @mixed_clamp_to_float_1(i32 %x) { ; CHECK-LABEL: @mixed_clamp_to_float_1( -; CHECK-NEXT: [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1) -; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN:%.*]], i32 1) +; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.smin.i32(i32 [[R1]], i32 255) +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 
[[R2]] to float ; CHECK-NEXT: ret float [[R]] ; %si_min_cmp = icmp sgt i32 %x, 255 @@ -535,9 +535,9 @@ define i32 @mixed_clamp_to_i32_1(float %x) { define float @mixed_clamp_to_float_2(i32 %x) { ; CHECK-LABEL: @mixed_clamp_to_float_2( -; CHECK-NEXT: [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1) -; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN:%.*]], i32 1) +; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.smin.i32(i32 [[R1]], i32 255) +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R2]] to float ; CHECK-NEXT: ret float [[R]] ; %si_min_cmp = icmp sgt i32 %x, 255 @@ -568,9 +568,9 @@ define i32 @mixed_clamp_to_i32_2(float %x) { define <2 x float> @mixed_clamp_to_float_vec(<2 x i32> %x) { ; CHECK-LABEL: @mixed_clamp_to_float_vec( -; CHECK-NEXT: [[SI_MIN:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[X:%.*]], <2 x i32> splat (i32 255)) -; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[SI_MIN]], <2 x i32> splat (i32 1)) -; CHECK-NEXT: [[R:%.*]] = uitofp nneg <2 x i32> [[R1]] to <2 x float> +; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[SI_MIN:%.*]], <2 x i32> splat (i32 1)) +; CHECK-NEXT: [[R2:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[R1]], <2 x i32> splat (i32 255)) +; CHECK-NEXT: [[R:%.*]] = uitofp nneg <2 x i32> [[R2]] to <2 x float> ; CHECK-NEXT: ret <2 x float> [[R]] ; %si_min_cmp = icmp sgt <2 x i32> %x, diff --git a/llvm/test/Transforms/InstCombine/max-min-canonicalize.ll b/llvm/test/Transforms/InstCombine/max-min-canonicalize.ll new file mode 100644 index 0000000000000..d438c07cf2fbe --- /dev/null +++ b/llvm/test/Transforms/InstCombine/max-min-canonicalize.ll @@ -0,0 +1,367 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; smax(smin(X, MinC), MaxC) -> smin(smax(X, MaxC), MinC) +; umax(umin(X, MinC), MaxC) -> umin(smax(X, MaxC), MinC) + + +define i16 @smax_smin(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -1) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smin.i16(i16 [[TMP1]], i16 255) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 255) + %max = call i16 @llvm.smax.i16(i16 %min, i16 -1) + ret i16 %max +} + +define i16 @umax_umin(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 63) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umin.i16(i16 [[TMP1]], i16 255) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 255) + %max = call i16 @llvm.umax.i16(i16 %min, i16 63) + ret i16 %max +} + +define i16 @smax_smin_commute0(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_commute0( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smin.i16(i16 [[TMP1]], i16 127) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 127) + %max = call i16 @llvm.smax.i16(i16 %min, i16 -128) + ret i16 %max +} + +define i16 @umax_umin_commute0(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_commute0( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 127) +; 
CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umin.i16(i16 [[TMP1]], i16 255) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 255) + %max = call i16 @llvm.umax.i16(i16 %min, i16 127) + ret i16 %max +} + +define i16 @smax_smin_commute1(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_commute1( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smin.i16(i16 [[TMP1]], i16 127) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 127, i16 %x) + %max = call i16 @llvm.smax.i16(i16 %min, i16 -128) + ret i16 %max +} + +define i16 @umax_umin_commute1(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_commute1( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umin.i16(i16 [[TMP1]], i16 255) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 255, i16 %x) + %max = call i16 @llvm.umax.i16(i16 %min, i16 127) + ret i16 %max +} + +define i16 @smax_smin_commute2(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_commute2( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smin.i16(i16 [[TMP1]], i16 127) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 127) + %max = call i16 @llvm.smax.i16(i16 -128, i16 %min) + ret i16 %max +} + +define i16 @umax_umin_commute2(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_commute2( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umin.i16(i16 [[TMP1]], i16 255) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 255) + %max = call i16 @llvm.umax.i16(i16 127, i16 %min) + ret i16 %max +} + +define i16 @smax_smin_commute3(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_commute3( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smin.i16(i16 [[TMP1]], i16 127) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 127) + %max = call i16 @llvm.smax.i16(i16 %min, i16 -128) + ret i16 %max +} + +define i16 @umax_umin_commute3(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_commute3( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umin.i16(i16 [[TMP1]], i16 255) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 255) + %max = call i16 @llvm.umax.i16(i16 %min, i16 127) + ret i16 %max +} + +define <2 x i16> @smax_smin_v2i16(<2 x i16> %x) { +; CHECK-LABEL: define <2 x i16> @smax_smin_v2i16( +; CHECK-SAME: <2 x i16> [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[X]], <2 x i16> splat (i16 -128)) +; CHECK-NEXT: [[MAX:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP1]], <2 x i16> splat (i16 127)) +; CHECK-NEXT: ret <2 x i16> [[MAX]] +; + %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %x, <2 x i16> splat (i16 127)) + %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %min, <2 x i16> splat (i16 -128)) + ret <2 x i16> %max +} + +define <2 x i16> @umax_umin_v2i16(<2 x i16> %x) { +; CHECK-LABEL: define <2 x i16> @umax_umin_v2i16( +; CHECK-SAME: <2 x i16> [[X:%.*]]) { +; CHECK-NEXT: 
[[TMP1:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[X]], <2 x i16> splat (i16 127)) +; CHECK-NEXT: [[MAX:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP1]], <2 x i16> splat (i16 255)) +; CHECK-NEXT: ret <2 x i16> [[MAX]] +; + %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %x, <2 x i16> splat (i16 255)) + %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %min, <2 x i16> splat (i16 127)) + ret <2 x i16> %max +} + +define <2 x i8> @smax_smin_v2i16_nonsplat_1(<2 x i8> %a) { +; CHECK-LABEL: define <2 x i8> @smax_smin_v2i16_nonsplat_1( +; CHECK-SAME: <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[A]], <2 x i8> ) +; CHECK-NEXT: [[MAX:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[MIN]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[MAX]] +; + %min = call <2 x i8> @llvm.smin(<2 x i8> %a, <2 x i8> ) + %max = call <2 x i8> @llvm.smax(<2 x i8> %min, <2 x i8> ) + ret <2 x i8> %max +} + +define <2 x i16> @umax_umin_v2i16_nonsplat_1(<2 x i16> %a) { +; CHECK-LABEL: define <2 x i16> @umax_umin_v2i16_nonsplat_1( +; CHECK-SAME: <2 x i16> [[A:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[A]], <2 x i16> ) +; CHECK-NEXT: [[MAX:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[MIN]], <2 x i16> ) +; CHECK-NEXT: ret <2 x i16> [[MAX]] +; + %min = call <2 x i16> @llvm.umin(<2 x i16> %a, <2 x i16> ) + %max = call <2 x i16> @llvm.umax(<2 x i16> %min, <2 x i16> ) + ret <2 x i16> %max +} + +define <2 x i8> @smax_smin_v2i16_nonsplat_2(<2 x i8> %a) { +; CHECK-LABEL: define <2 x i8> @smax_smin_v2i16_nonsplat_2( +; CHECK-SAME: <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[A]], <2 x i8> ) +; CHECK-NEXT: [[MAX:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[MIN]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[MAX]] +; + %min = call <2 x i8> @llvm.smin(<2 x i8> %a, <2 x i8> ) + %max = call <2 x i8> @llvm.smax(<2 x i8> %min, <2 x i8> ) + ret <2 x i8> %max +} + +define <2 x i16> @umax_umin_v2i16_nonsplat_2(<2 x i16> %a) { +; CHECK-LABEL: define <2 x i16> @umax_umin_v2i16_nonsplat_2( +; CHECK-SAME: <2 x i16> [[A:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[A]], <2 x i16> ) +; CHECK-NEXT: [[MAX:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[MIN]], <2 x i16> ) +; CHECK-NEXT: ret <2 x i16> [[MAX]] +; + %min = call <2 x i16> @llvm.umin(<2 x i16> %a, <2 x i16> ) + %max = call <2 x i16> @llvm.umax(<2 x i16> %min, <2 x i16> ) + ret <2 x i16> %max +} + +declare void @use(i16) + +define i16 @smax_smin_used(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_used( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.smin.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smax.i16(i16 [[MIN]], i16 -128) +; CHECK-NEXT: call void @use(i16 [[MIN]]) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 127) + %max = call i16 @llvm.smax.i16(i16 %min, i16 -128) + call void @use(i16 %min) + ret i16 %max +} + +define i16 @umax_umin_used(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_used( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.umin.i16(i16 [[X]], i16 255) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umax.i16(i16 [[MIN]], i16 127) +; CHECK-NEXT: call void @use(i16 [[MIN]]) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 255) + %max = call i16 @llvm.umax.i16(i16 %min, i16 127) + call void @use(i16 %min) + ret i16 %max +} + + +define i16 
@smax_smin_i16_limits(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_i16_limits( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: ret i16 [[X]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 32767) + %max = call i16 @llvm.smax.i16(i16 %min, i16 -32768) + ret i16 %max +} + +define i16 @umax_umin_i16_limits(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_i16_limits( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: ret i16 [[X]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 65535) + %max = call i16 @llvm.umax.i16(i16 %min, i16 0) + ret i16 %max +} + +define i16 @min_smax_smin_nested(i16 %x) { +; CHECK-LABEL: define i16 @min_smax_smin_nested( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MIN2:%.*]] = call i16 @llvm.smin.i16(i16 [[TMP1]], i16 127) +; CHECK-NEXT: ret i16 [[MIN2]] +; + %min1 = call i16 @llvm.smin.i16(i16 %x, i16 255) + %max = call i16 @llvm.smax.i16(i16 %min1, i16 -128) + %min2 = call i16 @llvm.smin.i16(i16 %max, i16 127) + ret i16 %min2 +} + +define i16 @umax_umin_nested(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_nested( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 63) +; CHECK-NEXT: [[MIN2:%.*]] = call i16 @llvm.umin.i16(i16 [[MAX]], i16 127) +; CHECK-NEXT: ret i16 [[MIN2]] +; + %min1 = call i16 @llvm.umin.i16(i16 %x, i16 255) + %max = call i16 @llvm.umax.i16(i16 %min1, i16 63) + %min2 = call i16 @llvm.umin.i16(i16 %max, i16 127) + ret i16 %min2 +} + +define i16 @smax_smin_cmax_sgt_cmin(i16 %x) { +; CHECK-LABEL: define i16 @smax_smin_cmax_sgt_cmin( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: ret i16 127 +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 126) + %max = call i16 @llvm.smax.i16(i16 %min, i16 127) + ret i16 %max +} + +define i16 @umax_umin_cmax_ugt_cmin(i16 %x) { +; CHECK-LABEL: define i16 @umax_umin_cmax_ugt_cmin( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: ret i16 127 +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 126) + %max = call i16 @llvm.umax.i16(i16 %min, i16 127) + ret i16 %max +} + +define i16 @smax_smin_variables(i16 %x, i16 %y, i16 %z) { +; CHECK-LABEL: define i16 @smax_smin_variables( +; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[Z:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.smin.i16(i16 [[X]], i16 [[Y]]) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smax.i16(i16 [[MIN]], i16 [[Z]]) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.smin.i16(i16 %x, i16 %y) + %max = call i16 @llvm.smax.i16(i16 %min, i16 %z) + ret i16 %max +} + +define i16 @umax_umin_variables(i16 %x, i16 %y, i16 %z) { +; CHECK-LABEL: define i16 @umax_umin_variables( +; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[Z:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.umin.i16(i16 [[X]], i16 [[Y]]) +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umax.i16(i16 [[MIN]], i16 [[Z]]) +; CHECK-NEXT: ret i16 [[MAX]] +; + %min = call i16 @llvm.umin.i16(i16 %x, i16 %y) + %max = call i16 @llvm.umax.i16(i16 %min, i16 %z) + ret i16 %max +} + +define i16 @smin_smax(i16 %x) { +; CHECK-LABEL: define i16 @smin_smax( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.smin.i16(i16 [[MAX]], i16 127) +; CHECK-NEXT: ret i16 [[MIN]] +; + %max = call i16 @llvm.smax.i16(i16 %x, i16 -128) + %min = call i16 @llvm.smin.i16(i16 %max, i16 127) + ret i16 %min +} + +define i16 @umin_umax(i16 %x) { +; CHECK-LABEL: define i16 @umin_umax( +; CHECK-SAME: i16 [[X:%.*]]) 
{ +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.umin.i16(i16 [[MAX]], i16 255) +; CHECK-NEXT: ret i16 [[MIN]] +; + %max = call i16 @llvm.umax.i16(i16 %x, i16 127) + %min = call i16 @llvm.umin.i16(i16 %max, i16 255) + ret i16 %min +} + +define i16 @smin_umax(i16 %x) { +; CHECK-LABEL: define i16 @smin_umax( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.umax.i16(i16 [[X]], i16 127) +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.smin.i16(i16 [[MAX]], i16 255) +; CHECK-NEXT: ret i16 [[MIN]] +; + %max = call i16 @llvm.umax.i16(i16 %x, i16 127) + %min = call i16 @llvm.smin.i16(i16 %max, i16 255) + ret i16 %min +} + +define i16 @umin_smax(i16 %x) { +; CHECK-LABEL: define i16 @umin_smax( +; CHECK-SAME: i16 [[X:%.*]]) { +; CHECK-NEXT: [[MAX:%.*]] = call i16 @llvm.smax.i16(i16 [[X]], i16 -128) +; CHECK-NEXT: [[MIN:%.*]] = call i16 @llvm.umin.i16(i16 [[MAX]], i16 127) +; CHECK-NEXT: ret i16 [[MIN]] +; + %max = call i16 @llvm.smax.i16(i16 %x, i16 -128) + %min = call i16 @llvm.umin.i16(i16 %max, i16 127) + ret i16 %min +} diff --git a/llvm/test/Transforms/InstCombine/max_known_bits.ll b/llvm/test/Transforms/InstCombine/max_known_bits.ll index 3eb53b32efecc..162abf0efb7cc 100644 --- a/llvm/test/Transforms/InstCombine/max_known_bits.ll +++ b/llvm/test/Transforms/InstCombine/max_known_bits.ll @@ -35,9 +35,9 @@ define i16 @min_max_clamp(i16 %x) { ; Same as above with min/max reversed. define i16 @min_max_clamp_2(i16 %x) { ; CHECK-LABEL: @min_max_clamp_2( -; CHECK-NEXT: [[B:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047) -; CHECK-NEXT: [[D:%.*]] = call i16 @llvm.smax.i16(i16 [[B]], i16 -2048) -; CHECK-NEXT: [[E:%.*]] = add nsw i16 [[D]], 1 +; CHECK-NEXT: [[D:%.*]] = call i16 @llvm.smax.i16(i16 [[B:%.*]], i16 -2048) +; CHECK-NEXT: [[D1:%.*]] = call i16 @llvm.smin.i16(i16 [[D]], i16 2047) +; CHECK-NEXT: [[E:%.*]] = add nsw i16 [[D1]], 1 ; CHECK-NEXT: ret i16 [[E]] ; %a = icmp slt i16 %x, 2047 @@ -71,9 +71,9 @@ define i32 @min_max_clamp_3(i16 %x) { ; Same as above with min/max order reversed define i32 @min_max_clamp_4(i16 %x) { ; CHECK-LABEL: @min_max_clamp_4( -; CHECK-NEXT: [[B:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047) -; CHECK-NEXT: [[D:%.*]] = call i16 @llvm.smax.i16(i16 [[B]], i16 -2048) -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[D]] to i32 +; CHECK-NEXT: [[D:%.*]] = call i16 @llvm.smax.i16(i16 [[B:%.*]], i16 -2048) +; CHECK-NEXT: [[D1:%.*]] = call i16 @llvm.smin.i16(i16 [[D]], i16 2047) +; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[D1]] to i32 ; CHECK-NEXT: ret i32 [[TMP1]] ; %a = icmp slt i16 %x, 2047 @@ -106,9 +106,9 @@ define i16 @min_max_clamp_intrinsic(i16 %x) { define i16 @min_max_clamp_intrinsic_2(i16 %x) { ; CHECK-LABEL: @min_max_clamp_intrinsic_2( -; CHECK-NEXT: [[A:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047) -; CHECK-NEXT: [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A]], i16 -2048) -; CHECK-NEXT: [[C:%.*]] = add nsw i16 [[B]], 1 +; CHECK-NEXT: [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A:%.*]], i16 -2048) +; CHECK-NEXT: [[B1:%.*]] = call i16 @llvm.smin.i16(i16 [[B]], i16 2047) +; CHECK-NEXT: [[C:%.*]] = add nsw i16 [[B1]], 1 ; CHECK-NEXT: ret i16 [[C]] ; %a = call i16 @llvm.smin.i16(i16 %x, i16 2047) @@ -134,9 +134,9 @@ define i32 @min_max_clamp_intrinsic_3(i16 %x) { define i32 @min_max_clamp_intrinsic_4(i16 %x) { ; CHECK-LABEL: @min_max_clamp_intrinsic_4( -; CHECK-NEXT: [[A:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047) -; CHECK-NEXT: [[B:%.*]] = call i16 
@llvm.smax.i16(i16 [[A]], i16 -2048) -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[B]] to i32 +; CHECK-NEXT: [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A:%.*]], i16 -2048) +; CHECK-NEXT: [[B1:%.*]] = call i16 @llvm.smin.i16(i16 [[B]], i16 2047) +; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[B1]] to i32 ; CHECK-NEXT: ret i32 [[TMP1]] ; %a = call i16 @llvm.smin.i16(i16 %x, i16 2047) diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 4d66e261c649c..cd376b74fb36c 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -346,9 +346,9 @@ define i32 @test75(i32 %x) { define i32 @clamp_signed1(i32 %x) { ; CHECK-LABEL: @clamp_signed1( -; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.smax.i32(i32 [[MIN]], i32 15) -; CHECK-NEXT: ret i32 [[R]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.smax.i32(i32 [[MIN:%.*]], i32 15) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smin.i32(i32 [[R]], i32 255) +; CHECK-NEXT: ret i32 [[R1]] ; %cmp2 = icmp slt i32 %x, 255 %min = select i1 %cmp2, i32 %x, i32 255 @@ -376,9 +376,9 @@ define i32 @clamp_signed2(i32 %x) { define i32 @clamp_signed3(i32 %x) { ; CHECK-LABEL: @clamp_signed3( -; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.smax.i32(i32 [[MIN]], i32 15) -; CHECK-NEXT: ret i32 [[R]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.smax.i32(i32 [[MIN:%.*]], i32 15) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smin.i32(i32 [[R]], i32 255) +; CHECK-NEXT: ret i32 [[R1]] ; %cmp2 = icmp slt i32 %x, 255 %min = select i1 %cmp2, i32 %x, i32 255 @@ -406,9 +406,9 @@ define i32 @clamp_signed4(i32 %x) { define i32 @clamp_unsigned1(i32 %x) { ; CHECK-LABEL: @clamp_unsigned1( -; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.umax.i32(i32 [[MIN]], i32 15) -; CHECK-NEXT: ret i32 [[R]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.umax.i32(i32 [[MIN:%.*]], i32 15) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.umin.i32(i32 [[R]], i32 255) +; CHECK-NEXT: ret i32 [[R1]] ; %cmp2 = icmp ult i32 %x, 255 %min = select i1 %cmp2, i32 %x, i32 255 @@ -436,9 +436,9 @@ define i32 @clamp_unsigned2(i32 %x) { define i32 @clamp_unsigned3(i32 %x) { ; CHECK-LABEL: @clamp_unsigned3( -; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.umax.i32(i32 [[MIN]], i32 15) -; CHECK-NEXT: ret i32 [[R]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.umax.i32(i32 [[MIN:%.*]], i32 15) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.umin.i32(i32 [[R]], i32 255) +; CHECK-NEXT: ret i32 [[R1]] ; %cmp2 = icmp ult i32 %x, 255 %min = select i1 %cmp2, i32 %x, i32 255 @@ -467,9 +467,9 @@ define i32 @clamp_unsigned4(i32 %x) { ; (icmp sgt smin(PositiveA, B) 0) -> (icmp sgt B 0) define i32 @clamp_check_for_no_infinite_loop1(i32 %i) { ; CHECK-LABEL: @clamp_check_for_no_infinite_loop1( -; CHECK-NEXT: [[SEL1:%.*]] = call i32 @llvm.smin.i32(i32 [[I:%.*]], i32 255) -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.smax.i32(i32 [[SEL1]], i32 0) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.smax.i32(i32 [[SEL1:%.*]], i32 0) +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.smin.i32(i32 [[RES]], i32 255) +; CHECK-NEXT: ret i32 [[RES1]] ; %cmp1 = icmp slt i32 %i, 255 %sel1 = select i1 %cmp1, i32 %i, i32 255 @@ -1429,8 +1429,8 @@ define i8 @PR46271(<2 x i8> %x) { 
define i32 @twoway_clamp_lt(i32 %num) { ; CHECK-LABEL: @twoway_clamp_lt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[NUM:%.*]], 13767 -; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP0]], i32 13768, i32 13767 +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 [[NUM:%.*]], 13768 +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP0]], i32 13767, i32 13768 ; CHECK-NEXT: ret i32 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index 0b7127f82b612..9a8608da9fd5b 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -774,8 +774,8 @@ define i8 @clamp_two_vals_smax_smin(i8 %x) { define <3 x i8> @clamp_two_vals_smin_smax(<3 x i8> %x) { ; CHECK-LABEL: @clamp_two_vals_smin_smax( -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <3 x i8> [[X:%.*]], splat (i8 41) -; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i8> splat (i8 42), <3 x i8> splat (i8 41) +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i8> [[X:%.*]], splat (i8 42) +; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i8> splat (i8 41), <3 x i8> splat (i8 42) ; CHECK-NEXT: ret <3 x i8> [[R]] ; %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %x, <3 x i8> ) @@ -796,8 +796,8 @@ define i8 @clamp_two_vals_umax_umin(i8 %x) { define i8 @clamp_two_vals_umin_umax(i8 %x) { ; CHECK-LABEL: @clamp_two_vals_umin_umax( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i8 [[X:%.*]], 41 -; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i8 42, i8 41 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i8 [[X:%.*]], 42 +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i8 41, i8 42 ; CHECK-NEXT: ret i8 [[R]] ; %m = call i8 @llvm.umin.i8(i8 %x, i8 42) @@ -2192,9 +2192,9 @@ define i8 @umin_umin_reassoc_constants(i8 %x) { define i8 @smin_smax_reassoc_constants(i8 %x) { ; CHECK-LABEL: @smin_smax_reassoc_constants( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smin.i8(i8 [[X:%.*]], i8 97) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smax.i8(i8 [[M1]], i8 -3) -; CHECK-NEXT: ret i8 [[M2]] +; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smax.i8(i8 [[M1:%.*]], i8 -3) +; CHECK-NEXT: [[M3:%.*]] = call i8 @llvm.smin.i8(i8 [[M2]], i8 97) +; CHECK-NEXT: ret i8 [[M3]] ; %m1 = call i8 @llvm.smin.i8(i8 %x, i8 97) %m2 = call i8 @llvm.smax.i8(i8 %m1, i8 -3) diff --git a/llvm/test/Transforms/InstCombine/sadd_sat.ll b/llvm/test/Transforms/InstCombine/sadd_sat.ll index d27e7aa28d62c..6afb77d975b8c 100644 --- a/llvm/test/Transforms/InstCombine/sadd_sat.ll +++ b/llvm/test/Transforms/InstCombine/sadd_sat.ll @@ -77,8 +77,8 @@ define i32 @smul_sat32(i32 %a, i32 %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = mul nsw i64 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smax.i64(i64 [[ADD]], i64 -2147483648) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 2147483647) ; CHECK-NEXT: [[CONV7:%.*]] = trunc nsw i64 [[SPEC_STORE_SELECT8]] to i32 ; CHECK-NEXT: ret i32 [[CONV7]] ; @@ -100,8 +100,8 @@ define i32 @smul_sat32_mm(i32 %a, i32 %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = mul nsw i64 [[CONV1]], [[CONV]] -; CHECK-NEXT: 
[[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smax.i64(i64 [[ADD]], i64 -2147483648) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 2147483647) ; CHECK-NEXT: [[CONV7:%.*]] = trunc nsw i64 [[SPEC_STORE_SELECT8]] to i32 ; CHECK-NEXT: ret i32 [[CONV7]] ; @@ -293,8 +293,8 @@ define signext i4 @sadd_sat4(i4 signext %a, i4 signext %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i4 [[A:%.*]] to i32 ; CHECK-NEXT: [[CONV1:%.*]] = sext i4 [[B:%.*]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[ADD]], i32 7) -; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -8) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD]], i32 -8) +; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 7) ; CHECK-NEXT: [[CONV9:%.*]] = trunc nsw i32 [[SPEC_STORE_SELECT10]] to i4 ; CHECK-NEXT: ret i4 [[CONV9]] ; @@ -316,8 +316,8 @@ define signext i4 @ssub_sat4(i4 signext %a, i4 signext %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i4 [[A:%.*]] to i32 ; CHECK-NEXT: [[CONV1:%.*]] = sext i4 [[B:%.*]] to i32 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 7) -; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -8) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.smax.i32(i32 [[SUB]], i32 -8) +; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 7) ; CHECK-NEXT: [[CONV9:%.*]] = trunc nsw i32 [[SPEC_STORE_SELECT10]] to i4 ; CHECK-NEXT: ret i4 [[CONV9]] ; @@ -405,8 +405,8 @@ define <4 x i32> @sadd_satv4i4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @sadd_satv4i4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[ADD]], <4 x i32> splat (i32 15)) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SPEC_STORE_SELECT]], <4 x i32> splat (i32 -16)) +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[ADD]], <4 x i32> splat (i32 -16)) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP0]], <4 x i32> splat (i32 15)) ; CHECK-NEXT: ret <4 x i32> [[SPEC_STORE_SELECT8]] ; entry: @@ -422,8 +422,8 @@ define <4 x i32> @ssub_satv4i4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @ssub_satv4i4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = sub <4 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[ADD]], <4 x i32> splat (i32 15)) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SPEC_STORE_SELECT]], <4 x i32> splat (i32 -16)) +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[ADD]], <4 x i32> splat (i32 -16)) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP0]], <4 x i32> splat (i32 15)) ; CHECK-NEXT: ret <4 x i32> [[SPEC_STORE_SELECT8]] ; entry: @@ -511,8 +511,8 @@ define i32 @sadd_sat32_extrause_3(i32 %a, i32 %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[CONV1:%.*]] = sext 
i32 [[B:%.*]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smax.i64(i64 [[ADD]], i64 -2147483648) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 2147483647) ; CHECK-NEXT: [[CONV7:%.*]] = trunc nsw i64 [[SPEC_STORE_SELECT8]] to i32 ; CHECK-NEXT: call void @use64(i64 [[ADD]]) ; CHECK-NEXT: ret i32 [[CONV7]] @@ -536,8 +536,8 @@ define i32 @sadd_sat32_extrause_3_mm(i32 %a, i32 %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smax.i64(i64 [[ADD]], i64 -2147483648) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 2147483647) ; CHECK-NEXT: [[CONV7:%.*]] = trunc nsw i64 [[SPEC_STORE_SELECT8]] to i32 ; CHECK-NEXT: call void @use64(i64 [[ADD]]) ; CHECK-NEXT: ret i32 [[CONV7]] @@ -559,8 +559,8 @@ define i32 @sadd_sat32_trunc(i32 %a, i32 %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 32767) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -32768) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.smax.i64(i64 [[ADD]], i64 -32768) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 32767) ; CHECK-NEXT: [[CONV7:%.*]] = trunc nsw i64 [[SPEC_STORE_SELECT8]] to i32 ; CHECK-NEXT: ret i32 [[CONV7]] ; @@ -601,8 +601,8 @@ define i8 @sadd_sat8_ext8(i8 %a, i16 %b) { ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[A:%.*]] to i32 ; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[ADD]], i32 127) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -128) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD]], i32 -128) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 127) ; CHECK-NEXT: [[CONV7:%.*]] = trunc nsw i32 [[SPEC_STORE_SELECT8]] to i8 ; CHECK-NEXT: ret i8 [[CONV7]] ; diff --git a/llvm/test/Transforms/InstCombine/select-min-max.ll b/llvm/test/Transforms/InstCombine/select-min-max.ll index 0430fcd5ad370..99906620f8df2 100644 --- a/llvm/test/Transforms/InstCombine/select-min-max.ll +++ b/llvm/test/Transforms/InstCombine/select-min-max.ll @@ -216,8 +216,8 @@ define i32 @smax_smin(i32 %x) { define i32 @smin_smax(i32 %x) { ; CHECK-LABEL: @smin_smax( -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[X:%.*]], -2 -; CHECK-NEXT: [[S:%.*]] = select i1 [[TMP1]], i32 -1, i32 -2 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[S:%.*]] = select i1 [[TMP1]], i32 -2, i32 -1 ; CHECK-NEXT: ret i32 [[S]] ; %m = call i32 @llvm.smin.i32(i32 %x, i32 -1) @@ -240,8 +240,8 @@ define i8 
@umax_umin(i8 %x) {
 define i8 @umin_umax(i8 %x) {
 ; CHECK-LABEL: @umin_umax(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i8 [[X:%.*]], 126
-; CHECK-NEXT: [[S:%.*]] = select i1 [[TMP1]], i8 127, i8 126
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i8 [[X:%.*]], 127
+; CHECK-NEXT: [[S:%.*]] = select i1 [[TMP1]], i8 126, i8 127
 ; CHECK-NEXT: ret i8 [[S]]
 ;
   %m = call i8 @llvm.umin.i8(i8 %x, i8 127)

From 4a58071d87265dfccba72134b25cf4d1595d98c5 Mon Sep 17 00:00:00 2001
From: Diana Picus
Date: Wed, 23 Apr 2025 10:33:36 +0200
Subject: [PATCH 032/245] [AMDGPU] Support block load/store for CSR (#130013)

Add support for using the existing `SCRATCH_STORE_BLOCK` and
`SCRATCH_LOAD_BLOCK` instructions for saving and restoring callee-saved
VGPRs. This is controlled by a new subtarget feature, `block-vgpr-csr`.
It does not include WWM registers - those will be saved and restored
individually, just like before. This patch does not change the ABI.

Use of this feature may lead to slightly increased stack usage, because
the memory is not compacted if certain registers don't have to be
transferred (this will happen in practice for calling conventions where
the callee- and caller-saved registers are interleaved in groups of 8).
However, if the registers at the end of the block of 32 don't have to be
transferred, we don't need to use a whole 128-byte stack slot - we can
trim some space off the end of the range.

In order to implement this feature, we need to rely less on the
target-independent code in the PrologEpilogInserter, so we override
several new methods in `SIFrameLowering`. We also add new pseudos,
`SI_BLOCK_SPILL_V1024_SAVE/RESTORE`.

One peculiarity is that both the SI_BLOCK_SPILL_V1024_RESTORE pseudo and
the SCRATCH_LOAD_BLOCK instructions will have all the registers that are
not transferred added as implicit uses. This is done in order to inform
LiveRegUnits that those registers are not available before the restore
(since we're not really restoring them - so we can't afford to scavenge
them). Unfortunately, this trick doesn't work with the save, so before
the save all the registers in the block will be unavailable (see the
unit test).
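As a rough illustration of the slot trimming described above (assuming the
128-byte VReg_1024 spill slot and a 32-bit transfer mask; the helper
trimmedBlockSpillSize below is illustrative only and not part of this patch):

  #include <bit>
  #include <cstdint>

  // Each leading zero in the 32-bit mask is a lane at the top of the block
  // that is never transferred, so the 128-byte slot can shrink by 4 bytes
  // per such lane.
  unsigned trimmedBlockSpillSize(uint32_t Mask) {
    const unsigned FullBlockBytes = 32 * 4; // spill size of a full VGPR block
    return FullBlockBytes - std::countl_zero(Mask) * 4;
  }

  // For example, Mask == 0x00FF00FF keeps 128 - 8 * 4 = 96 bytes, while
  // Mask == 0x3 only needs 128 - 30 * 4 = 8 bytes.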
--- llvm/include/llvm/CodeGen/MachineFrameInfo.h | 1 + .../llvm/CodeGen/TargetFrameLowering.h | 17 + llvm/lib/CodeGen/PrologEpilogInserter.cpp | 35 +-- llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp | 35 +++ llvm/lib/Target/AMDGPU/AMDGPU.td | 8 + llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 37 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 204 ++++++++++++ llvm/lib/Target/AMDGPU/SIFrameLowering.h | 17 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 14 + llvm/lib/Target/AMDGPU/SIInstructions.td | 21 +- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 32 ++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 73 ++++- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 16 + .../AMDGPU/pei-vgpr-block-spill-csr.mir | 294 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll | 93 ++++++ .../CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir | 47 +++ llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 + llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp | 160 ++++++++++ 19 files changed, 1066 insertions(+), 42 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir create mode 100644 llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir create mode 100644 llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index 172c3e8c9a847..9d1b536d23331 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -61,6 +61,7 @@ class CalleeSavedInfo { MCRegister getReg() const { return Reg; } int getFrameIdx() const { return FrameIdx; } MCRegister getDstReg() const { return DstReg; } + void setReg(MCRegister R) { Reg = R; } void setFrameIdx(int FI) { FrameIdx = FI; SpilledToReg = false; diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index cdbefb36c00c7..58b63f1769003 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -270,6 +270,14 @@ class TargetFrameLowering { return false; } + /// spillCalleeSavedRegister - Default implementation for spilling a single + /// callee saved register. + void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock, + MachineBasicBlock::iterator MI, + const CalleeSavedInfo &CS, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) const; + /// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee /// saved registers and returns true if it isn't possible / profitable to do /// so by issuing a series of load instructions via loadRegToStackSlot(). @@ -284,6 +292,15 @@ class TargetFrameLowering { return false; } + // restoreCalleeSavedRegister - Default implementation for restoring a single + // callee saved register. Should be called in reverse order. Can insert + // multiple instructions. + void restoreCalleeSavedRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const CalleeSavedInfo &CS, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) const; + /// hasFP - Return true if the specified function should have a dedicated /// frame pointer register. For most targets this is true only if the function /// has variable sized allocas or if frame pointer elimination is disabled. 
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 9b852c0fd49cf..0cd25c4feb8b9 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, // Now that we know which registers need to be saved and restored, allocate // stack slots for them. for (auto &CS : CSI) { - // If the target has spilled this register to another register, we don't - // need to allocate a stack slot. + // If the target has spilled this register to another register or already + // handled it , we don't need to allocate a stack slot. if (CS.isSpilledToReg()) continue; @@ -597,25 +597,14 @@ static void updateLiveness(MachineFunction &MF) { static void insertCSRSaves(MachineBasicBlock &SaveBlock, ArrayRef CSI) { MachineFunction &MF = *SaveBlock.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CS : CSI) { - // Insert the spill to the stack frame. - MCRegister Reg = CS.getReg(); - - if (CS.isSpilledToReg()) { - BuildMI(SaveBlock, I, DebugLoc(), - TII.get(TargetOpcode::COPY), CS.getDstReg()) - .addReg(Reg, getKillRegState(true)); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, - TRI, Register()); - } + TFI->spillCalleeSavedRegister(SaveBlock, I, CS, TII, TRI); } } } @@ -624,7 +613,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, static void insertCSRRestores(MachineBasicBlock &RestoreBlock, std::vector &CSI) { MachineFunction &MF = *RestoreBlock.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); @@ -634,19 +623,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CI : reverse(CSI)) { - MCRegister Reg = CI.getReg(); - if (CI.isSpilledToReg()) { - BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg) - .addReg(CI.getDstReg(), getKillRegState(true)); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, - TRI, Register()); - assert(I != RestoreBlock.begin() && - "loadRegFromStackSlot didn't insert any code!"); - // Insert in reverse order. loadRegFromStackSlot can insert - // multiple instructions. 
- } + TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, TII, TRI); } } } diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index be73b73c93989..70c3b2cbae9a6 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -182,3 +183,37 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const { const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF).id()}}; } + +void TargetFrameLowering::spillCalleeSavedRegister( + MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI, + const CalleeSavedInfo &CS, const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) const { + // Insert the spill to the stack frame. + MCRegister Reg = CS.getReg(); + + if (CS.isSpilledToReg()) { + BuildMI(SaveBlock, MI, DebugLoc(), TII->get(TargetOpcode::COPY), + CS.getDstReg()) + .addReg(Reg, getKillRegState(true)); + } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII->storeRegToStackSlot(SaveBlock, MI, Reg, true, CS.getFrameIdx(), RC, + TRI, Register()); + } +} + +void TargetFrameLowering::restoreCalleeSavedRegister( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const CalleeSavedInfo &CS, const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) const { + MCRegister Reg = CS.getReg(); + if (CS.isSpilledToReg()) { + BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), Reg) + .addReg(CS.getDstReg(), getKillRegState(true)); + } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, + Register()); + assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); + } +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b2098b41acb7e..d896589825fc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1275,6 +1275,14 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32 "Use a block size of 32 for dynamic VGPR allocation (default is 16)" >; +// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and +// restoring the callee-saved registers. 
+def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr", + "UseBlockVGPROpsForCSR", + "true", + "Use block load/store for VGPR callee saved registers" +>; + def FeatureLshlAddU64Inst : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", "Has v_lshl_add_u64 instruction">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 3d6b974d1f027..2dec16de940d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" @@ -243,6 +244,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV, return AsmPrinter::lowerConstant(CV, BaseCV, Offset); } +static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII, + const TargetRegisterInfo *TRI, + const SIMachineFunctionInfo *MFI, + MCStreamer &OS) { + // The instruction will only transfer a subset of the registers in the block, + // based on the mask that is stored in m0. We could search for the instruction + // that sets m0, but most of the time we'll already have the mask stored in + // the machine function info. Try to use that. This assumes that we only use + // block loads/stores for CSR spills. + Register RegBlock = + TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst + : AMDGPU::OpName::vdata) + ->getReg(); + Register FirstRegInBlock = TRI->getSubReg(RegBlock, AMDGPU::sub0); + uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock); + + if (!Mask) + return; // Nothing to report + + SmallString<512> TransferredRegs; + for (unsigned I = 0; I < sizeof(Mask) * 8; ++I) { + if (Mask & (1 << I)) { + (llvm::Twine(" ") + TRI->getRegAsmName(FirstRegInBlock + I)) + .toVector(TransferredRegs); + } + } + + OS.emitRawComment(" transferring at most " + TransferredRegs); +} + void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { // FIXME: Enable feature predicate checks once all the test pass. // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(), @@ -331,6 +362,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (isVerbose()) + if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode())) + emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(), + MF->getInfo(), + *OutStreamer); + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7dd91c0775a48..fea17baa17722 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -262,6 +262,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasPointSampleAccel = false; bool RequiresCOV6 = false; + bool UseBlockVGPROpsForCSR = false; // Dummy feature to use for assembler in tablegen. 
bool FeatureDisable = false; @@ -1277,6 +1278,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool requiresCodeObjectV6() const { return RequiresCOV6; } + bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; } + bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 9c737b4f3e378..0c1cd9ceddb02 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1739,6 +1739,105 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, } } +static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, + const GCNSubtarget &ST, + std::vector &CSI, + unsigned &MinCSFrameIndex, + unsigned &MaxCSFrameIndex) { + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + assert(std::is_sorted(CSI.begin(), CSI.end(), + [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) { + return A.getReg() < B.getReg(); + }) && + "Callee saved registers not sorted"); + + auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) { + return !CSI.isSpilledToReg() && + TRI->getPhysRegBaseClass(CSI.getReg()) == &AMDGPU::VGPR_32RegClass && + !FuncInfo->isWWMReservedRegister(CSI.getReg()); + }; + + auto CSEnd = CSI.end(); + for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) { + Register Reg = CSIt->getReg(); + if (!CanUseBlockOps(*CSIt)) + continue; + + // Find all the regs that will fit in a 32-bit mask starting at the current + // reg and build said mask. It should have 1 for every register that's + // included, with the current register as the least significant bit. + uint32_t Mask = 1; + CSEnd = std::remove_if( + CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool { + if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) { + Mask |= 1 << (CSI.getReg() - Reg); + return true; + } else { + return false; + } + }); + + const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF); + Register RegBlock = + TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass); + if (!RegBlock) { + // We couldn't find a super register for the block. This can happen if + // the register we started with is too high (e.g. v232 if the maximum is + // v255). We therefore try to get the last register block and figure out + // the mask from there. + Register LastBlockStart = + AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32); + RegBlock = + TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass); + assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) && + "Couldn't find super register"); + int RegDelta = Reg - LastBlockStart; + assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta && + "Bad shift amount"); + Mask <<= RegDelta; + } + + FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask); + + // The stack objects can be a bit smaller than the register block if we know + // some of the high bits of Mask are 0. This may happen often with calling + // conventions where the caller and callee-saved VGPRs are interleaved at + // a small boundary (e.g. 8 or 16). 
+ int UnusedBits = llvm::countl_zero(Mask); + unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4; + int FrameIdx = + MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass), + /*isSpillSlot=*/true); + if ((unsigned)FrameIdx < MinCSFrameIndex) + MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) + MaxCSFrameIndex = FrameIdx; + + CSIt->setFrameIdx(FrameIdx); + CSIt->setReg(RegBlock); + } + CSI.erase(CSEnd, CSI.end()); +} + +bool SIFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI, unsigned &MinCSFrameIndex, + unsigned &MaxCSFrameIndex) const { + if (CSI.empty()) + return true; // Early exit if no callee saved registers are modified! + + const GCNSubtarget &ST = MF.getSubtarget(); + bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR(); + + if (UseVGPRBlocks) + assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex); + + return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks; +} + bool SIFrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const { @@ -1808,6 +1907,111 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( return true; } +bool SIFrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + ArrayRef CSI, const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + if (!ST.useVGPRBlockOpsForCSR()) + return false; + + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + SIMachineFunctionInfo *FuncInfo = MF->getInfo(); + + const TargetRegisterClass *BlockRegClass = + static_cast(TRI)->getRegClassForBlockOp(*MF); + for (const CalleeSavedInfo &CS : CSI) { + Register Reg = CS.getReg(); + if (!BlockRegClass->contains(Reg) || + !FuncInfo->hasMaskForVGPRBlockOps(Reg)) { + spillCalleeSavedRegister(MBB, MI, CS, TII, TRI); + continue; + } + + // Build a scratch block store. + uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg); + int FrameIndex = CS.getFrameIdx(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO = + MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + FrameInfo.getObjectSize(FrameIndex), + FrameInfo.getObjectAlign(FrameIndex)); + + BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE)) + .addReg(Reg, getKillRegState(false)) + .addFrameIndex(FrameIndex) + .addReg(MFI->getStackPtrOffsetReg()) + .addImm(0) + .addImm(Mask) + .addMemOperand(MMO); + + FuncInfo->setHasSpilledVGPRs(); + + // Add the register to the liveins. This is necessary because if any of the + // VGPRs in the register block is reserved (e.g. if it's a WWM register), + // then the whole block will be marked as reserved and `updateLiveness` will + // skip it. 
+ MBB.addLiveIn(Reg); + } + MBB.sortUniqueLiveIns(); + + return true; +} + +bool SIFrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + if (!ST.useVGPRBlockOpsForCSR()) + return false; + + SIMachineFunctionInfo *FuncInfo = MF->getInfo(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *SITRI = static_cast(TRI); + const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(*MF); + for (const CalleeSavedInfo &CS : reverse(CSI)) { + Register Reg = CS.getReg(); + if (!BlockRegClass->contains(Reg) || + !FuncInfo->hasMaskForVGPRBlockOps(Reg)) { + restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI); + continue; + } + + // Build a scratch block load. + uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg); + int FrameIndex = CS.getFrameIdx(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlign(FrameIndex)); + + auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg) + .addFrameIndex(FrameIndex) + .addReg(FuncInfo->getStackPtrOffsetReg()) + .addImm(0) + .addImm(Mask) + .addMemOperand(MMO); + SITRI->addImplicitUsesForBlockCSRLoad(MIB, Reg); + + // Add the register to the liveins. This is necessary because if any of the + // VGPRs in the register block is reserved (e.g. if it's a WWM register), + // then the whole block will be marked as reserved and `updateLiveness` will + // skip it. 
+ MBB.addLiveIn(Reg); + } + + MBB.sortUniqueLiveIns(); + return true; +} + MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 9dac4bc8951e5..a72772987262e 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,6 +49,23 @@ class SIFrameLowering final : public AMDGPUFrameLowering { const TargetRegisterInfo *TRI, std::vector &CSI) const override; + bool assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI, + unsigned &MinCSFrameIndex, + unsigned &MaxCSFrameIndex) const override; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + ArrayRef CSI, + const TargetRegisterInfo *TRI) const override; + + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef CSI, + const TargetRegisterInfo *TRI) const override; + bool allocateScavengingFrameIndexesNearIncomingSP( const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a3a54659d299a..4b97f58ce92b9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -665,6 +665,20 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isBlockLoadStore(uint16_t Opcode) { + switch (Opcode) { + case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: + case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: + case AMDGPU::SCRATCH_STORE_BLOCK_SADDR: + case AMDGPU::SCRATCH_LOAD_BLOCK_SADDR: + case AMDGPU::SCRATCH_STORE_BLOCK_SVS: + case AMDGPU::SCRATCH_LOAD_BLOCK_SVS: + return true; + default: + return false; + } + } + static bool isEXP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::EXP; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index ed45cf8851146..a144ae2104da6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1025,13 +1025,16 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst), // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register // needs to be used and an extra instruction to move between VGPR and AGPR. // UsesTmp adds to the total size of an expanded spill in this case. -multiclass SI_SPILL_VGPR { +multiclass SI_SPILL_VGPR { let UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] in { def _SAVE : VPseudoInstSI < (outs), - (ins vgpr_class:$vdata, i32imm:$vaddr, - SReg_32:$soffset, i32imm:$offset)> { + !con( + (ins vgpr_class:$vdata, i32imm:$vaddr, + SReg_32:$soffset, i32imm:$offset), + !if(HasMask, (ins SReg_32:$mask), (ins)))> { let mayStore = 1; let mayLoad = 0; // (2 * 4) + (8 * num_subregs) bytes maximum @@ -1042,8 +1045,10 @@ multiclass SI_SPILL_VGPR { def _RESTORE : VPseudoInstSI < (outs vgpr_class:$vdata), - (ins i32imm:$vaddr, - SReg_32:$soffset, i32imm:$offset)> { + !con( + (ins i32imm:$vaddr, + SReg_32:$soffset, i32imm:$offset), + !if(HasMask, (ins SReg_32:$mask), (ins)))> { let mayStore = 0; let mayLoad = 1; @@ -1071,6 +1076,12 @@ defm SI_SPILL_V384 : SI_SPILL_VGPR ; defm SI_SPILL_V512 : SI_SPILL_VGPR ; defm SI_SPILL_V1024 : SI_SPILL_VGPR ; +let Defs = [M0] in { + // Spills a block of 32 VGPRs. 
M0 will contain a mask describing which + // registers in the block need to be transferred. + defm SI_BLOCK_SPILL_V1024 : SI_SPILL_VGPR ; +} + defm SI_SPILL_A32 : SI_SPILL_VGPR ; defm SI_SPILL_A64 : SI_SPILL_VGPR ; defm SI_SPILL_A96 : SI_SPILL_VGPR ; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index a60409b5a7e09..9c1014a0e5cfe 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -388,6 +388,16 @@ class PrologEpilogSGPRSaveRestoreInfo { SGPRSaveKind getKind() const { return Kind; } }; +const MCRegister FirstVGPRBlock = AMDGPU::VReg_1024RegClass.getRegister(0); + +struct VGPRBlock2IndexFunctor { + using argument_type = Register; + unsigned operator()(Register Reg) const { + assert(AMDGPU::VReg_1024RegClass.contains(Reg) && "Expecting a VGPR block"); + return Reg - FirstVGPRBlock; + } +}; + /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction, @@ -574,6 +584,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // frame, so save it here and add it to the RegScavenger later. std::optional ScavengeFI; + // Map each VGPR CSR to the mask needed to save and restore it using block + // load/store instructions. Only used if the subtarget feature for VGPR block + // load/store is enabled. + IndexedMap MaskForVGPRBlockOps; + private: Register VGPRForAGPRCopy; @@ -594,6 +609,19 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const; + void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask) { + MaskForVGPRBlockOps.grow(RegisterBlock); + MaskForVGPRBlockOps[RegisterBlock] = Mask; + } + + uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const { + return MaskForVGPRBlockOps[RegisterBlock]; + } + + bool hasMaskForVGPRBlockOps(Register RegisterBlock) const { + return MaskForVGPRBlockOps.inBounds(RegisterBlock); + } + public: SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default; SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI); @@ -634,6 +662,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, const WWMSpillsMap &getWWMSpills() const { return WWMSpills; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } + bool isWWMReservedRegister(Register Reg) const { + return WWMReservedRegs.contains(Reg); + } + ArrayRef getPrologEpilogSGPRSpills() const { assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first())); return PrologEpilogSGPRSpills; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index c1ac9491b2363..0e4cd12e57d77 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1177,9 +1177,18 @@ SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { return RC; } -static unsigned getNumSubRegsForSpillOp(unsigned Op) { +static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, + const SIInstrInfo *TII) { + unsigned Op = MI.getOpcode(); switch (Op) { + case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: + case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: + // FIXME: This assumes the mask is statically known and not computed at + // runtime. However, some ABIs may want to compute the mask dynamically and + // this will need to be updated. 
+ return llvm::popcount( + (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm()); case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_V1024_SAVE: @@ -1520,6 +1529,10 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, bool UseST = !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr); + // Handle block load/store first. + if (TII->isBlockLoadStore(LoadStoreOp)) + return LoadStoreOp; + switch (EltSize) { case 4: LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR @@ -1564,6 +1577,7 @@ void SIRegisterInfo::buildSpillLoadStore( const MCInstrDesc *Desc = &TII->get(LoadStoreOp); bool IsStore = Desc->mayStore(); bool IsFlat = TII->isFLATScratch(LoadStoreOp); + bool IsBlock = TII->isBlockLoadStore(LoadStoreOp); bool CanClobberSCC = false; bool Scavenged = false; @@ -1576,7 +1590,10 @@ void SIRegisterInfo::buildSpillLoadStore( // Always use 4 byte operations for AGPRs because we need to scavenge // a temporary VGPR. - unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; + // If we're using a block operation, the element should be the whole block. + unsigned EltSize = IsBlock ? RegWidth + : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) + : 4u; unsigned NumSubRegs = RegWidth / EltSize; unsigned Size = NumSubRegs * EltSize; unsigned RemSize = RegWidth - Size; @@ -1731,6 +1748,7 @@ void SIRegisterInfo::buildSpillLoadStore( LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); } else { assert(ST.hasFlatScratchSTMode()); + assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST"); LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); } @@ -1939,6 +1957,14 @@ void SIRegisterInfo::buildSpillLoadStore( MIB.addReg(SubReg, RegState::Implicit); MIB->tieOperands(0, MIB->getNumOperands() - 1); } + + // If we're building a block load, we should add artificial uses for the + // CSR VGPRs that are *not* being transferred. This is because liveness + // analysis is not aware of the mask, so we need to somehow inform it that + // those registers are not available before the load and they should not be + // scavenged. + if (!IsStore && TII->isBlockLoadStore(LoadStoreOp)) + addImplicitUsesForBlockCSRLoad(MIB, ValueReg); } if (ScratchOffsetRegDelta != 0) { @@ -1949,6 +1975,18 @@ void SIRegisterInfo::buildSpillLoadStore( } } +void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, + Register BlockReg) const { + const MachineFunction *MF = MIB->getParent()->getParent(); + const SIMachineFunctionInfo *FuncInfo = MF->getInfo(); + uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg); + Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0); + for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset) + if (!(Mask & (1 << RegOffset)) && + isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF)) + MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit); +} + void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill) const { @@ -2367,6 +2405,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // VGPR register spill + case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: { + // Put mask into M0. 
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), + AMDGPU::M0) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); + LLVM_FALLTHROUGH; + } case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V384_SAVE: @@ -2427,8 +2472,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16; } else { - Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE + ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR + : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; } auto *MBB = MI->getParent(); @@ -2441,13 +2488,20 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); - MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); + MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(*MI, TII)); if (IsWWMRegSpill) TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); MI->eraseFromParent(); return true; } + case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: { + // Put mask into M0. + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), + AMDGPU::M0) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); + LLVM_FALLTHROUGH; + } case AMDGPU::SI_SPILL_V16_RESTORE: case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_V64_RESTORE: @@ -2503,14 +2557,17 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; } else { - Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE + ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR + : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; } + auto *MBB = MI->getParent(); bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); if (IsWWMRegSpill) { TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), - RS->isRegUsed(AMDGPU::SCC)); + RS->isRegUsed(AMDGPU::SCC)); } buildSpillLoadStore( diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index f3068963fd10f..a4b135d5e0b59 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -26,6 +26,7 @@ namespace llvm { class GCNSubtarget; class LiveIntervals; class LiveRegUnits; +class MachineInstrBuilder; class RegisterBank; struct SGPRSpillBuilder; @@ -115,6 +116,16 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { return 100; } + // When building a block VGPR load, we only really transfer a subset of the + // registers in the block, based on a mask. Liveness analysis is not aware of + // the mask, so it might consider that any register in the block is available + // before the load and may therefore be scavenged. This is not ok for CSRs + // that are not clobbered, since the caller will expect them to be preserved. + // This method will add artificial implicit uses for those registers on the + // load instruction, so liveness analysis knows they're unavailable. 
+ void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, + Register BlockReg) const; + const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override; @@ -158,6 +169,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + const TargetRegisterClass * + getRegClassForBlockOp(const MachineFunction &MF) const { + return &AMDGPU::VReg_1024RegClass; + } + void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill = true) const; diff --git a/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir b/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir new file mode 100644 index 0000000000000..086390f575fbb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir @@ -0,0 +1,294 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+block-vgpr-csr,+wavefrontsize32,-wavefrontsize64 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,W32 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+block-vgpr-csr,-wavefrontsize32,+wavefrontsize64 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,W64 + +--- | + define void @one_block() { ret void } + define void @one_block_csr_only() { ret void } + define void @multiple_blocks() { ret void } + define void @reg_tuples() { ret void } + define void @locals() { ret void } + define void @other_regs() { ret void } + define amdgpu_kernel void @entry_func() { ret void } + define void @multiple_basic_blocks() { ret void } +... + +# Block load/store v42 and v45. The mask should be 0x9. 
+ +--- +name: one_block +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + liveins: $sgpr30_sgpr31 + ; CHECK-LABEL: name: one_block + ; CHECK: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 9 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45 + ; CHECK-NEXT: $m0 = S_MOV_B32 9 + ; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45 + S_SETPC_B64_return $sgpr30_sgpr31 +... + +# Block load/store v40-47 and v56-63 (v48-55 and v64-71 are caller-saved). The +# mask should be 0x00FF00FF. 
+ +--- +name: one_block_csr_only +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + liveins: $sgpr30_sgpr31 + ; CHECK-LABEL: name: one_block_csr_only + ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 16711935 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66 + ; CHECK-NEXT: $m0 = S_MOV_B32 16711935 + ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66 + S_SETPC_B64_return $sgpr30_sgpr31 +... + +# Block load/store to/from different blocks. +# Note the mask for storing v232, which is 0x100 because we have to start the +# block at v224 (since the upper limit is 255). For the same reason, the first +# stack slot will be 36 bytes long (the first 32 will be empty, since the memory +# will not get compacted). The second slot, which will hold registers v104 and +# v110, will be 28 bytes long, and finally the third, holding registers v40 and +# v41, will be 8 bytes long. 
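+# (v232 is 8 registers above the block start v224, hence the single mask bit
+# 1 << 8 = 0x100; its dword lands at byte offset 8 * 4 = 32 of the slot, which
+# is why the 36-byte slot keeps its first 32 bytes unused.)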
+--- +name: multiple_blocks +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + liveins: $sgpr30_sgpr31 + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 3 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: $m0 = S_MOV_B32 65 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: $m0 = S_MOV_B32 256 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232 + ; CHECK-NEXT: $m0 = S_MOV_B32 256 + ; CHECK-NEXT: $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr233, implicit $vgpr234, implicit $vgpr235, implicit $vgpr236, implicit $vgpr237, implicit $vgpr238, implicit $vgpr239, implicit $vgpr248, implicit $vgpr249, implicit $vgpr250, implicit $vgpr251, implicit $vgpr252, implicit $vgpr253, implicit $vgpr254, implicit $vgpr255 :: (load (s1024) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: $m0 = S_MOV_B32 65 + ; CHECK-NEXT: $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 36, 0, implicit 
$exec, implicit $flat_scr, implicit $m0, implicit $vgpr105, implicit $vgpr106, implicit $vgpr107, implicit $vgpr108, implicit $vgpr109, implicit $vgpr111, implicit $vgpr120, implicit $vgpr121, implicit $vgpr122, implicit $vgpr123, implicit $vgpr124, implicit $vgpr125, implicit $vgpr126, implicit $vgpr127 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: $m0 = S_MOV_B32 3 + ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 64, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232 + S_SETPC_B64_return $sgpr30_sgpr31 +... + +# Make sure we handle register tuples correctly, even when they're straddling +# the boundary between blocks. The first mask should be 0x00000007 (the bottom +# 2 registers from the second tuple are not callee saves), the second +# 0x00000003. + +--- +name: reg_tuples +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + liveins: $sgpr30_sgpr31 + ; CHECK-LABEL: name: reg_tuples + ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 7 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: $m0 = S_MOV_B32 3 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73 + ; CHECK-NEXT: $m0 = S_MOV_B32 3 + ; CHECK-NEXT: $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr74, implicit $vgpr75, implicit 
$vgpr76, implicit $vgpr77, implicit $vgpr78, implicit $vgpr79, implicit $vgpr88, implicit $vgpr89, implicit $vgpr90, implicit $vgpr91, implicit $vgpr92, implicit $vgpr93, implicit $vgpr94, implicit $vgpr95 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: $m0 = S_MOV_B32 7 + ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73 + S_SETPC_B64_return $sgpr30_sgpr31 +... + +# Make sure we don't overwrite any stack variables. + +--- +name: locals +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +stack: +- { id: 0, type: default, offset: 0, size: 12, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +- { id: 1, type: default, offset: 12, size: 20, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +body: | + bb.0: + liveins: $sgpr30_sgpr31, $vgpr48 + ; CHECK-LABEL: name: locals + ; CHECK: liveins: $vgpr48, $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 1 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40 + ; CHECK-NEXT: $m0 = S_MOV_B32 1 + ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.2, align 4, addrspace 5) 
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + S_NOP 0, implicit-def $vgpr40 + S_SETPC_B64_return $sgpr30_sgpr31 +... + +# Make sure we don't break SGPR or WWM handling, and also that we don't +# block-spill WWM VGPRs that have already been spilled (the mask for the block +# load/store should be 0x9 because we don't want to include v41 or v42). +# Use all VGPRs up to v40, so the WWM registers v41 and v42 and the VGPR used +# for SGPR spills remain within the block. + +--- +name: other_regs +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 + wwmReservedRegs: + - '$vgpr41' + - '$vgpr42' +body: | + bb.0: + liveins: $sgpr30_sgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40 + ; W32-LABEL: name: other_regs + ; W32: liveins: $sgpr48, $sgpr30_sgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 + ; W32-NEXT: {{ $}} + ; W32-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; W32-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 + ; W32-NEXT: $m0 = S_MOV_B32 9 + ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.4, align 4, addrspace 5) + ; W32-NEXT: $vgpr44 = SI_SPILL_S32_TO_VGPR $sgpr48, 0, $vgpr44 + ; W32-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec + ; W32-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40 + ; W32-NEXT: $sgpr48 = SI_RESTORE_S32_FROM_VGPR $vgpr44, 0 + ; W32-NEXT: $m0 = S_MOV_B32 9 + ; W32-NEXT: 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.4, align 4, addrspace 5) + ; W32-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; W32-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; W32-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; W32-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; W32-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 + ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + ; + ; W64-LABEL: name: other_regs + ; W64: liveins: $sgpr48, $sgpr30_sgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 + ; W64-NEXT: {{ $}} + ; W64-NEXT: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; W64-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 + ; W64-NEXT: $m0 = S_MOV_B32 9 + ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.4, align 4, addrspace 5) + ; W64-NEXT: $vgpr44 = SI_SPILL_S32_TO_VGPR $sgpr48, 0, $vgpr44 + ; W64-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec + ; W64-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40 + ; W64-NEXT: $sgpr48 = SI_RESTORE_S32_FROM_VGPR $vgpr44, 0 + ; 
W64-NEXT: $m0 = S_MOV_B32 9 + ; W64-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.4, align 4, addrspace 5) + ; W64-NEXT: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; W64-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; W64-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; W64-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; W64-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 + ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec + S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40 + + S_SETPC_B64_return $sgpr30_sgpr31 +... + +# Make sure we don't break anything for entry functions. + +--- +name: entry_func +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + liveins: $sgpr30_sgpr31 + ; CHECK-LABEL: name: entry_func + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45, implicit-def $vgpr51 + ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45, implicit-def $vgpr51 + S_SETPC_B64_return $sgpr30_sgpr31 +... 
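+
+# Make sure clobbers from all basic blocks are accounted for: v42 and v45 are
+# clobbered in bb.0 and v43 in bb.1, so the expected mask for the block
+# starting at v42 is 0xb (11).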
+ +--- +name: multiple_basic_blocks +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + ; CHECK-LABEL: name: multiple_basic_blocks + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr44, $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 11 + ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45 + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr44, $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr43, implicit $vgpr44 + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $m0 = S_MOV_B32 11 + ; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + bb.0: + liveins: $sgpr30_sgpr31, $vgpr44 + S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45 + S_BRANCH %bb.1 + + bb.1: + liveins: $sgpr30_sgpr31, $vgpr44 + S_NOP 0, implicit-def $vgpr43, implicit $vgpr44 + S_BRANCH %bb.2 + + bb.2: + liveins: $sgpr30_sgpr31 + S_SETPC_B64_return $sgpr30_sgpr31 +... 
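For reference, every expected m0 value in the test above follows the same rule: bit i of the mask is set when register (first register of the block + i) actually has to be transferred. Below is a minimal standalone sketch of that arithmetic for a few of the cases above; it is not part of the patch, and blockTransferMask is an ad-hoc name used only for illustration, not an LLVM API.

```
#include <cstdio>
#include <vector>

// Bit I of the mask corresponds to register (BlockFirstReg + I).
static unsigned blockTransferMask(unsigned BlockFirstReg,
                                  const std::vector<unsigned> &Regs) {
  unsigned Mask = 0;
  for (unsigned Reg : Regs)
    Mask |= 1u << (Reg - BlockFirstReg);
  return Mask;
}

int main() {
  // one_block: v42 and v45, block starting at v42 -> 0x9.
  std::printf("0x%x\n", blockTransferMask(42, {42, 45}));

  // one_block_csr_only: v40-v47 and v56-v63, block starting at v40 -> 0xff00ff.
  std::vector<unsigned> Csrs;
  for (unsigned R = 40; R <= 47; ++R)
    Csrs.push_back(R);
  for (unsigned R = 56; R <= 63; ++R)
    Csrs.push_back(R);
  std::printf("0x%x\n", blockTransferMask(40, Csrs));

  // multiple_blocks, third block: only v232, block starting at v224 -> 0x100.
  std::printf("0x%x\n", blockTransferMask(224, {232}));
  return 0;
}
```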
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll new file mode 100644 index 0000000000000..91ad9742f7b28 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+block-vgpr-csr < %s | FileCheck -check-prefixes=CHECK,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+block-vgpr-csr < %s | FileCheck -check-prefixes=CHECK,DAGISEL %s + +define i32 @non_entry_func(i32 %x) { +; CHECK-LABEL: non_entry_func: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_xor_saveexec_b32 s0, -1 +; CHECK-NEXT: scratch_store_b32 off, v2, s32 offset:100 ; 4-byte Folded Spill +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: s_mov_b32 m0, 0x110003 +; CHECK-NEXT: v_writelane_b32 v2, s48, 0 +; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Spill +; CHECK-NEXT: scratch_store_block off, v[40:71], s32 offset:4 +; CHECK-NEXT: s_mov_b32 m0, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Spill +; CHECK-NEXT: scratch_store_block off, v[120:151], s32 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: s_nop +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Reload +; CHECK-NEXT: scratch_load_block v[120:151], off, s32 +; CHECK-NEXT: s_mov_b32 m0, 0x110003 +; CHECK-NEXT: scratch_store_b32 off, v1, s32 offset:88 +; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Reload +; CHECK-NEXT: scratch_load_block v[40:71], off, s32 offset:4 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_readlane_b32 s48, v2, 0 +; CHECK-NEXT: s_xor_saveexec_b32 s0, -1 +; CHECK-NEXT: scratch_load_b32 v2, off, s32 offset:100 ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %local = alloca i32, i32 3, addrspace(5) + store i32 %x, ptr addrspace(5) %local + call void asm "s_nop", "~{v0},~{v8},~{v40},~{v41},~{v49},~{v52},~{v56},~{v60},~{v120},~{s0},~{s48}"() + ret i32 %x +} + +define amdgpu_kernel void @entry_func(i32 %x) { +; GISEL-LABEL: entry_func: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GISEL-NEXT: s_mov_b64 s[12:13], s[0:1] +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: s_nop +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: s_add_co_u32 s8, s4, 4 +; GISEL-NEXT: s_mov_b32 s0, non_entry_func@abs32@lo +; GISEL-NEXT: s_mov_b32 s1, non_entry_func@abs32@hi +; GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL-NEXT: s_endpgm +; +; DAGISEL-LABEL: entry_func: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_load_b32 s12, s[4:5], 0x0 +; DAGISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; DAGISEL-NEXT: v_mov_b32_e32 v31, v0 +; DAGISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: s_nop +; DAGISEL-NEXT: 
;;#ASMEND +; DAGISEL-NEXT: s_add_nc_u64 s[8:9], s[4:5], 4 +; DAGISEL-NEXT: s_mov_b32 s1, non_entry_func@abs32@hi +; DAGISEL-NEXT: s_mov_b32 s0, non_entry_func@abs32@lo +; DAGISEL-NEXT: s_mov_b64 s[4:5], s[6:7] +; DAGISEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; DAGISEL-NEXT: s_mov_b32 s32, 0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: v_mov_b32_e32 v0, s12 +; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL-NEXT: s_endpgm + call void asm "s_nop", "~{v0},~{v8},~{v40},~{v41},~{v49},~{v52},~{v56},~{v60},~{v120},~{s0},~{s48}"() + %res = call i32 @non_entry_func(i32 %x) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir b/llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir new file mode 100644 index 0000000000000..6ef1c33ed18f6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir @@ -0,0 +1,47 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+block-vgpr-csr -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s + +# The spill slot for the VGPR block needs to hold v40 and v43, so it needs to be +# 16 bytes large. +--- +name: locals +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +stack: +- { id: 0, type: default, offset: 0, size: 12, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +- { id: 1, type: default, offset: 12, size: 20, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +# CHECK-LABEL: name: locals +# CHECK: frameInfo: +# CHECK: stackSize: 52 +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 16, size: 12, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 28, size: 20, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71', +# CHECK-NEXT: callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 3, name: '', type: default, offset: 48, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: machineFunctionInfo: +# CHECK: hasSpilledVGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $vgpr48 + SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 
5) + S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr43 + S_SETPC_B64_return $sgpr30_sgpr31 +... diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt index 6d6f17883a07e..d6cbaf3f3fb5d 100644 --- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt +++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt @@ -23,5 +23,6 @@ add_llvm_target_unittest(AMDGPUTests CSETest.cpp DwarfRegMappings.cpp ExecMayBeModifiedBeforeAnyUse.cpp + LiveRegUnits.cpp PALMetadata.cpp ) diff --git a/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp b/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp new file mode 100644 index 0000000000000..95266dc853bfd --- /dev/null +++ b/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp @@ -0,0 +1,160 @@ +//===--------- llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPUUnitTests.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/MIRParser/MIRParser.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/TargetParser/TargetParser.h" +#include "gtest/gtest.h" + +#include "AMDGPUGenSubtargetInfo.inc" + +using namespace llvm; + +// FIXME: Consolidate parseMIR and other common helpers (this one is copied from +// unittests/MIR/MachineMetadata.cpp). +std::unique_ptr parseMIR(LLVMContext &Context, const TargetMachine &TM, + StringRef MIRCode, const char *FnName, + MachineModuleInfo &MMI) { + SMDiagnostic Diagnostic; + std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); + auto MIR = createMIRParser(std::move(MBuffer), Context); + if (!MIR) + return nullptr; + + std::unique_ptr Mod = MIR->parseIRModule(); + if (!Mod) + return nullptr; + + Mod->setDataLayout(TM.createDataLayout()); + + if (MIR->parseMachineFunctions(*Mod, MMI)) { + return nullptr; + } + + return Mod; +} + +TEST(AMDGPULiveRegUnits, TestVGPRBlockLoadStore) { + auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx1200", ""); + ASSERT_TRUE(TM) << "No target machine"; + + GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM); + + // Add a very simple MIR snippet that saves and restores a block of VGPRs. The + // body of the function, represented by a S_NOP, clobbers one CSR (v42) and + // one caller-saved register (v49), and reads one CSR (v61) and one + // callee-saved register (v53). 
+ StringRef MIRString = R"MIR( +name: vgpr-block-insts +stack: +- { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4, + stack-id: default, callee-saved-register: '$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71', + callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +body: | + bb.0: + liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 + + $m0 = S_MOV_B32 1 + SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5) + S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr49, implicit $vgpr53, implicit $vgpr61 + $m0 = S_MOV_B32 1 + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: (load (s1024) from %stack.0, align 4, addrspace 5) + S_SETPC_B64_return $sgpr30_sgpr31 +... +)MIR"; + + LLVMContext Context; + MachineModuleInfo MMI(TM.get()); + auto M = parseMIR(Context, *TM, MIRString, "vgpr-block-insts", MMI); + + auto *MF = MMI.getMachineFunction(*M->getFunction("vgpr-block-insts")); + auto *MBB = MF->getBlockNumbered(0); + + auto MIt = --MBB->instr_end(); + + LiveRegUnits LiveUnits; + LiveUnits.init(*ST.getRegisterInfo()); + + LiveUnits.addLiveOuts(*MBB); + LiveUnits.stepBackward(*MIt); + + // Right after the restore, we expect all the CSRs to be unavailable. + // Check v40-v88 (callee and caller saved regs interleaved in blocks of 8). + for (unsigned I = 0; I < 8; ++I) { + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I; + } + + --MIt; + LiveUnits.stepBackward(*MIt); + + // Right before the restore, we expect the CSRs that are actually transferred + // (in this case v42) to be available. Everything else should be the same as + // before. 
+ for (unsigned I = 0; I < 8; ++I) { + if (I == 2) + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I; + else + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I; + } + + --MIt; // Set m0 has no effect on VGPRs. + LiveUnits.stepBackward(*MIt); + --MIt; // S_NOP. + LiveUnits.stepBackward(*MIt); + + // The S_NOP uses one of the caller-saved registers (v53), so that won't be + // available anymore. + for (unsigned I = 0; I < 8; ++I) { + if (I == 2) + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I; + else + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I; + if (I == 5) + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I; + else + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I; + } + + --MIt; + LiveUnits.stepBackward(*MIt); + + // Right before the save, all the VGPRs in the block that we're saving will be + // unavailable, regardless of whether they're callee or caller saved. This is + // unfortunate and should probably be fixed somehow. + // VGPRs outside the block will only be unavailable if they're callee saved. + for (unsigned I = 0; I < 8; ++I) { + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I; + EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I; + EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I; + } +} From 48585caf727004678617dc34fa50383c3f4eb2de Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 23 Apr 2025 10:51:55 +0200 Subject: [PATCH 033/245] InstCombine: Avoid counting uses of constants (#136566) Logically it does not matter; getFreelyInvertedImpl doesn't depend on the value for the m_ImmConstant case. This use count logic should probably sink into getFreelyInvertedImpl, every use of this appears to just be a hasOneUse or hasNUse count, so this could change to just be a use count threshold. 
--- .../InstCombine/InstCombineCompares.cpp | 4 +- llvm/test/Transforms/InstCombine/icmp.ll | 93 +++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 55afe1258159a..b7b0bb7361359 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5034,8 +5034,8 @@ static Instruction *foldICmpOrXX(ICmpInst &I, const SimplifyQuery &Q, if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) { // icmp (X | Y) eq/ne Y --> (X & ~Y) eq/ne 0 if Y is freely invertible - if (Value *NotOp1 = - IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder)) + if (Value *NotOp1 = IC.getFreelyInverted( + Op1, !isa(Op1) && !Op1->hasNUsesOrMore(3), &IC.Builder)) return new ICmpInst(Pred, IC.Builder.CreateAnd(A, NotOp1), Constant::getNullValue(Op1->getType())); // icmp (X | Y) eq/ne Y --> (~X | Y) eq/ne -1 if X is freely invertible. diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 6e1486660b24d..f5df8573d6304 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -2954,6 +2954,99 @@ define i1 @or1_eq1(i32 %x) { ret i1 %t1 } +define <2 x i1> @or1_eq1_vec(<2 x i32> %x) { +; CHECK-LABEL: @or1_eq1_vec( +; CHECK-NEXT: [[T1:%.*]] = icmp ult <2 x i32> [[X:%.*]], splat (i32 2) +; CHECK-NEXT: ret <2 x i1> [[T1]] +; + %t0 = or <2 x i32> %x, splat (i32 1) + %t1 = icmp eq <2 x i32> %t0, splat (i32 1) + ret <2 x i1> %t1 +} + +define <2 x i1> @or_eq_vec_nonsplat(<2 x i32> %x) { +; CHECK-LABEL: @or_eq_vec_nonsplat( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[T1:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[T1]] +; + %t0 = or <2 x i32> %x, + %t1 = icmp eq <2 x i32> %t0, + ret <2 x i1> %t1 +} + +define void @or_eq_vec_multiple_nonsplat(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %ptr0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: @or_eq_vec_multiple_nonsplat( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: store <2 x i1> [[CMP0]], ptr [[PTR0:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[Y:%.*]], +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <2 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: store <2 x i1> [[CMP1]], ptr [[PTR1:%.*]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[Z:%.*]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: store <2 x i1> [[CMP2]], ptr [[PTR2:%.*]], align 1 +; CHECK-NEXT: ret void +; + %t0 = or <2 x i32> %x, + %cmp0 = icmp eq <2 x i32> %t0, + store <2 x i1> %cmp0, ptr %ptr0 + + %t1 = or <2 x i32> %y, + %cmp1 = icmp eq <2 x i32> %t1, + store <2 x i1> %cmp1, ptr %ptr1 + + %t2 = or <2 x i32> %z, + %cmp2 = icmp eq <2 x i32> %t2, + store <2 x i1> %cmp2, ptr %ptr2 + ret void +} + +; Make sure use count of 1 doesn't matter +define i1 @or1_eq1_multiple(i32 %x, i32 %y, i32 %z, ptr %ptr0, ptr %ptr1) { +; CHECK-LABEL: @or1_eq1_multiple( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[X:%.*]], 2 +; CHECK-NEXT: store i1 [[CMP1]], ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], 2 +; CHECK-NEXT: store i1 [[CMP2]], ptr [[PTR1:%.*]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[Z:%.*]], 2 +; CHECK-NEXT: ret i1 [[CMP3]] +; + %t0 = or i32 %x, 1 + %cmp0 = icmp 
eq i32 %t0, 1 + store i1 %cmp0, ptr %ptr0 + + %t1 = or i32 %y, 1 + %cmp1 = icmp eq i32 %t1, 1 + store i1 %cmp1, ptr %ptr1 + + %t2 = or i32 %z, 1 + %cmp2 = icmp eq i32 %t2, 1 + ret i1 %cmp2 +} + +define <2 x i1> @or1_eq1_multiple_vec(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %ptr0, ptr %ptr1) { +; CHECK-LABEL: @or1_eq1_multiple_vec( +; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <2 x i32> [[X:%.*]], splat (i32 2) +; CHECK-NEXT: store <2 x i1> [[CMP0]], ptr [[PTR0:%.*]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i32> [[Y:%.*]], splat (i32 2) +; CHECK-NEXT: store <2 x i1> [[CMP1]], ptr [[PTR1:%.*]], align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult <2 x i32> [[Z:%.*]], splat (i32 2) +; CHECK-NEXT: ret <2 x i1> [[CMP2]] +; + %t0 = or <2 x i32> %x, splat (i32 1) + %cmp0 = icmp eq <2 x i32> %t0, splat (i32 1) + store <2 x i1> %cmp0, ptr %ptr0 + + %t1 = or <2 x i32> %y, splat (i32 1) + %cmp1 = icmp eq <2 x i32> %t1, splat (i32 1) + store <2 x i1> %cmp1, ptr %ptr1 + + %t2 = or <2 x i32> %z, splat (i32 1) + %cmp2 = icmp eq <2 x i32> %t2, splat (i32 1) + ret <2 x i1> %cmp2 +} + ; X | C == C --> X <=u C (when C+1 is PowerOf2). define <2 x i1> @or3_eq3_vec(<2 x i8> %x) { From a1331704752c46cd4d954eb8682af230937fe5a6 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 23 Apr 2025 08:53:09 +0000 Subject: [PATCH 034/245] [gn build] Port 4a58071d8726 --- llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn index 502aa13e1de81..a23a5a6c56671 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn @@ -20,6 +20,7 @@ unittest("AMDGPUTests") { "CSETest.cpp", "DwarfRegMappings.cpp", "ExecMayBeModifiedBeforeAnyUse.cpp", + "LiveRegUnits.cpp", "PALMetadata.cpp", ] } From 3cd6b86cc1e1fd1d8d62ca1bcb8498362a4f7b68 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Wed, 23 Apr 2025 18:11:34 +0900 Subject: [PATCH 035/245] [MachinePipeliner] Use AliasAnalysis properly when analyzing loop-carried dependencies (#136691) MachinePipeliner uses AliasAnalysis to collect loop-carried memory dependencies. To analyze loop-carried dependencies, we need to explicitly tell AliasAnalysis that the values may come from different iterations. Before this patch, MachinePipeliner didn't do this, so some loop-carried dependencies might be missed. For example, in the following case, there is a loop-carried dependency from the load to the store, but it wasn't considered. ``` def @f(ptr noalias %p0, ptr noalias %p1) { entry: br label %body loop: %idx0 = phi ptr [ %p0, %entry ], [ %p1, %body ] %idx1 = phi ptr [ %p1, %entry ], [ %p0, %body ] %v0 = load %idx0 ... store %v1, %idx1 ... } ``` Further, the handling of the underlying objects was not sound. If there is no information about memory operands (i.e., `memoperands()` is empty), it must be handled conservatively. However, Machinepipeliner uses a dummy value (namely `UnknownValue`). It is distinguished from other "known" objects, causing necessary dependencies to be missed. (NOTE: in such cases, `buildSchedGraph` adds non-loop-carried dependencies correctly, so perhaps a critical problem has not occurred.) This patch fixes the above problems. This change has increased false dependencies that didn't exist before. Therefore, this patch also introduces additional alias checks with the underlying objects. 
Split off from #135148 --- llvm/include/llvm/CodeGen/MachinePipeliner.h | 14 +- llvm/lib/CodeGen/MachinePipeliner.cpp | 225 +++++++++++------- .../Hexagon/swp-alias-cross-iteration.mir | 72 ++++++ llvm/test/CodeGen/Hexagon/swp-no-alias.mir | 151 ++++++++++++ 4 files changed, 371 insertions(+), 91 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir create mode 100644 llvm/test/CodeGen/Hexagon/swp-no-alias.mir diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index fee6937e7d502..966ffb7a1fbd2 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -278,6 +278,13 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { /// Ordered list of DAG postprocessing steps. std::vector> Mutations; + /// Used to compute single-iteration dependencies (i.e., buildSchedGraph). + AliasAnalysis *AA; + + /// Used to compute loop-carried dependencies (i.e., + /// addLoopCarriedDependences). + BatchAAResults BAA; + /// Helper class to implement Johnson's circuit finding algorithm. class Circuits { std::vector &SUnits; @@ -323,13 +330,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { public: SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis, const RegisterClassInfo &rci, unsigned II, - TargetInstrInfo::PipelinerLoopInfo *PLI) + TargetInstrInfo::PipelinerLoopInfo *PLI, AliasAnalysis *AA) : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis), RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI), - Topo(SUnits, &ExitSU) { + Topo(SUnits, &ExitSU), AA(AA), BAA(*AA) { P.MF->getSubtarget().getSMSMutations(Mutations); if (SwpEnableCopyToPhi) Mutations.push_back(std::make_unique()); + BAA.enableCrossIterationMode(); } void schedule() override; @@ -394,7 +402,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { const MachineInstr *OtherMI) const; private: - void addLoopCarriedDependences(AAResults *AA); + void addLoopCarriedDependences(); void updatePhiDependences(); void changeDependences(); unsigned calculateResMII(); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 6cb0299a30d7a..07bffc6c3de90 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -237,6 +237,37 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) +namespace { + +/// This class holds an SUnit corresponding to a memory operation and other +/// information related to the instruction. +struct SUnitWithMemInfo { + SUnit *SU; + SmallVector UnderlyingObjs; + + /// The value of a memory operand. + const Value *MemOpValue = nullptr; + + /// The offset of a memory operand. + int64_t MemOpOffset = 0; + + AAMDNodes AATags; + + /// True if all the underlying objects are identified. + bool IsAllIdentified = false; + + SUnitWithMemInfo(SUnit *SU); + + bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const; + + bool isUnknown() const { return MemOpValue == nullptr; } + +private: + bool getUnderlyingObjects(); +}; + +} // end anonymous namespace + /// The "main" function for implementing Swing Modulo Scheduling. 
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { if (skipFunction(mf.getFunction())) @@ -470,9 +501,10 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) { bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) { assert(L.getBlocks().size() == 1 && "SMS works on single blocks only."); + AliasAnalysis *AA = &getAnalysis().getAAResults(); SwingSchedulerDAG SMS( *this, L, getAnalysis().getLIS(), RegClassInfo, - II_setByPragma, LI.LoopPipelinerInfo.get()); + II_setByPragma, LI.LoopPipelinerInfo.get(), AA); MachineBasicBlock *MBB = L.getHeader(); // The kernel should not include any terminator instructions. These @@ -560,9 +592,8 @@ void SwingSchedulerDAG::setMAX_II() { /// We override the schedule function in ScheduleDAGInstrs to implement the /// scheduling part of the Swing Modulo Scheduling algorithm. void SwingSchedulerDAG::schedule() { - AliasAnalysis *AA = &Pass.getAnalysis().getAAResults(); buildSchedGraph(AA); - addLoopCarriedDependences(AA); + addLoopCarriedDependences(); updatePhiDependences(); Topo.InitDAGTopologicalSorting(); changeDependences(); @@ -810,113 +841,131 @@ static bool isDependenceBarrier(MachineInstr &MI) { (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad())); } -/// Return the underlying objects for the memory references of an instruction. +SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) { + if (!getUnderlyingObjects()) + return; + for (const Value *Obj : UnderlyingObjs) + if (!isIdentifiedObject(Obj)) { + IsAllIdentified = false; + break; + } +} + +bool SUnitWithMemInfo::isTriviallyDisjoint( + const SUnitWithMemInfo &Other) const { + // If all underlying objects are identified objects and there is no overlap + // between them, then these two instructions are disjoint. + if (!IsAllIdentified || !Other.IsAllIdentified) + return false; + for (const Value *Obj : UnderlyingObjs) + if (llvm::is_contained(Other.UnderlyingObjs, Obj)) + return false; + return true; +} + +/// Collect the underlying objects for the memory references of an instruction. /// This function calls the code in ValueTracking, but first checks that the /// instruction has a memory operand. -static void getUnderlyingObjects(const MachineInstr *MI, - SmallVectorImpl &Objs) { +/// Returns false if we cannot find the underlying objects. +bool SUnitWithMemInfo::getUnderlyingObjects() { + const MachineInstr *MI = SU->getInstr(); if (!MI->hasOneMemOperand()) - return; + return false; MachineMemOperand *MM = *MI->memoperands_begin(); if (!MM->getValue()) - return; - getUnderlyingObjects(MM->getValue(), Objs); - for (const Value *V : Objs) { - if (!isIdentifiedObject(V)) { - Objs.clear(); - return; - } - } + return false; + MemOpValue = MM->getValue(); + MemOpOffset = MM->getOffset(); + llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs); + + // TODO: A no alias scope may be valid only in a single iteration. In this + // case we need to peel off it like LoopAccessAnalysis does. + AATags = MM->getAAInfo(); + return true; } /// Add a chain edge between a load and store if the store can be an /// alias of the load on a subsequent iteration, i.e., a loop carried /// dependence. This code is very similar to the code in ScheduleDAGInstrs /// but that code doesn't create loop carried dependences. 
-void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { - MapVector> PendingLoads; - Value *UnknownValue = - UndefValue::get(Type::getVoidTy(MF.getFunction().getContext())); +void SwingSchedulerDAG::addLoopCarriedDependences() { + SmallVector PendingLoads; for (auto &SU : SUnits) { MachineInstr &MI = *SU.getInstr(); if (isDependenceBarrier(MI)) PendingLoads.clear(); else if (MI.mayLoad()) { - SmallVector Objs; - ::getUnderlyingObjects(&MI, Objs); - if (Objs.empty()) - Objs.push_back(UnknownValue); - for (const auto *V : Objs) { - SmallVector &SUs = PendingLoads[V]; - SUs.push_back(&SU); - } + PendingLoads.emplace_back(&SU); } else if (MI.mayStore()) { - SmallVector Objs; - ::getUnderlyingObjects(&MI, Objs); - if (Objs.empty()) - Objs.push_back(UnknownValue); - for (const auto *V : Objs) { - MapVector>::iterator I = - PendingLoads.find(V); - if (I == PendingLoads.end()) + SUnitWithMemInfo Store(&SU); + for (const SUnitWithMemInfo &Load : PendingLoads) { + if (Load.isTriviallyDisjoint(Store)) continue; - for (auto *Load : I->second) { - if (isSuccOrder(Load, &SU)) - continue; - MachineInstr &LdMI = *Load->getInstr(); - // First, perform the cheaper check that compares the base register. - // If they are the same and the load offset is less than the store - // offset, then mark the dependence as loop carried potentially. - const MachineOperand *BaseOp1, *BaseOp2; - int64_t Offset1, Offset2; - bool Offset1IsScalable, Offset2IsScalable; - if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, - Offset1IsScalable, TRI) && - TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, - Offset2IsScalable, TRI)) { - if (BaseOp1->isIdenticalTo(*BaseOp2) && - Offset1IsScalable == Offset2IsScalable && - (int)Offset1 < (int)Offset2) { - assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) && - "What happened to the chain edge?"); - SDep Dep(Load, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - continue; - } - } - // Second, the more expensive check that uses alias analysis on the - // base registers. If they alias, and the load offset is less than - // the store offset, the mark the dependence as loop carried. - if (!AA) { - SDep Dep(Load, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - continue; - } - MachineMemOperand *MMO1 = *LdMI.memoperands_begin(); - MachineMemOperand *MMO2 = *MI.memoperands_begin(); - if (!MMO1->getValue() || !MMO2->getValue()) { - SDep Dep(Load, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - continue; - } - if (MMO1->getValue() == MMO2->getValue() && - MMO1->getOffset() <= MMO2->getOffset()) { - SDep Dep(Load, SDep::Barrier); + if (isSuccOrder(Load.SU, Store.SU)) + continue; + MachineInstr &LdMI = *Load.SU->getInstr(); + // First, perform the cheaper check that compares the base register. + // If they are the same and the load offset is less than the store + // offset, then mark the dependence as loop carried potentially. 
+ const MachineOperand *BaseOp1, *BaseOp2; + int64_t Offset1, Offset2; + bool Offset1IsScalable, Offset2IsScalable; + if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, + Offset1IsScalable, TRI) && + TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, + Offset2IsScalable, TRI)) { + if (BaseOp1->isIdenticalTo(*BaseOp2) && + Offset1IsScalable == Offset2IsScalable && + (int)Offset1 < (int)Offset2) { + assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) && + "What happened to the chain edge?"); + SDep Dep(Load.SU, SDep::Barrier); Dep.setLatency(1); SU.addPred(Dep); continue; } - if (!AA->isNoAlias( - MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()), - MemoryLocation::getAfter(MMO2->getValue(), - MMO2->getAAInfo()))) { - SDep Dep(Load, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - } + } + // Second, the more expensive check that uses alias analysis on the + // base registers. If they alias, and the load offset is less than + // the store offset, the mark the dependence as loop carried. + if (Load.isUnknown() || Store.isUnknown()) { + SDep Dep(Load.SU, SDep::Barrier); + Dep.setLatency(1); + SU.addPred(Dep); + continue; + } + if (Load.MemOpValue == Store.MemOpValue && + Load.MemOpOffset <= Store.MemOpOffset) { + SDep Dep(Load.SU, SDep::Barrier); + Dep.setLatency(1); + SU.addPred(Dep); + continue; + } + + bool IsNoAlias = [&] { + if (BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(Load.MemOpValue, + Load.AATags), + MemoryLocation::getBeforeOrAfter(Store.MemOpValue, + Store.AATags))) + return true; + + // AliasAnalysis sometimes gives up on following the underlying + // object. In such a case, separate checks for underlying objects may + // prove that there are no aliases between two accesses. + for (const Value *LoadObj : Load.UnderlyingObjs) + for (const Value *StoreObj : Store.UnderlyingObjs) + if (!BAA.isNoAlias( + MemoryLocation::getBeforeOrAfter(LoadObj, Load.AATags), + MemoryLocation::getBeforeOrAfter(StoreObj, Store.AATags))) + return false; + + return true; + }(); + + if (!IsNoAlias) { + SDep Dep(Load.SU, SDep::Barrier); + Dep.setLatency(1); + SU.addPred(Dep); } } } diff --git a/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir b/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir new file mode 100644 index 0000000000000..8163074b589d8 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir @@ -0,0 +1,72 @@ +# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test that pipeliner correctly detects the loop-carried dependency between the +# load and the store, which is indicated by `Ord` dependency from SU(2) to +# SU(4). Note that there is no dependency within a single iteration. 
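+# The addresses come from PHIs over the same two noalias arguments: the load
+# reads %p0 on the first iteration and %p1 afterwards, while the store writes
+# %p1 first and %p0 afterwards. A value stored in one iteration can therefore
+# be loaded in a later one, so the Ord/Barrier edge must be kept even though
+# the two accesses never alias within a single iteration.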
+ +# CHECK: SU(2): %7:intregs = L2_loadri_io %5:intregs, 0 :: (load (s32) from %ir.ptr.load) +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Predecessors: +# CHECK-NEXT: SU(0): Data Latency=0 Reg=%5 +# CHECK-NEXT: Successors: +# CHECK-DAG: SU(3): Data Latency=2 Reg=%7 +# CHECK-DAG: SU(4): Ord Latency=1 Barrier +# CHECK-NEXT: SU(3): %8:intregs = F2_sfadd %7:intregs, %3:intregs, implicit $usr +# CHECK: SU(4): S2_storeri_io %6:intregs, 0, %8:intregs :: (store (s32) into %ir.ptr.store) + + +--- | + define void @foo(ptr noalias %p0, ptr noalias %p1, i32 %n) { + entry: + br label %body + + body: ; preds = %body, %entry + %i = phi i32 [ 0, %entry ], [ %i.next, %body ] + %ptr.load = phi ptr [ %p0, %entry ], [ %p1, %body ] + %ptr.store = phi ptr [ %p1, %entry ], [ %p0, %body ] + %v = load float, ptr %ptr.load, align 4 + %add = fadd float %v, 1.000000e+00 + store float %add, ptr %ptr.store, align 4 + %i.next = add i32 %i, 1 + %cond = icmp slt i32 %i.next, %n + br i1 %cond, label %body, label %exit + + exit: ; preds = %body + ret void + } +... +--- +name: foo +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2 + + %6:intregs = COPY $r2 + %5:intregs = COPY $r1 + %4:intregs = COPY $r0 + %9:intregs = A2_tfrsi 1065353216 + %12:intregs = COPY %6 + J2_loop0r %bb.1, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.1.body (machine-block-address-taken): + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + %1:intregs = PHI %4, %bb.0, %5, %bb.1 + %2:intregs = PHI %5, %bb.0, %4, %bb.1 + %8:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.ptr.load) + %10:intregs = F2_sfadd killed %8, %9, implicit $usr + S2_storeri_io %2, 0, killed %10 :: (store (s32) into %ir.ptr.store) + ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc + + bb.2.exit: + PS_jmpret $r31, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/swp-no-alias.mir b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir new file mode 100644 index 0000000000000..38b7212702ff9 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir @@ -0,0 +1,151 @@ +# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Test that there are no loop-carried dependencies between all memory instructions. 
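+# All loads dereference pointers based on %in (readonly) and all stores use
+# pointers based on %out (noalias, writeonly), so alias analysis can prove
+# every load/store pair disjoint and no loop-carried Ord/Barrier edges should
+# appear between the memory instructions checked below.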
+ +# CHECK: SU(0): %8:intregs = PHI %1:intregs, %bb.1, %9:intregs, %bb.2 +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Successors: +# CHECK-DAG: SU(6): Data Latency=0 Reg=%8 +# CHECK-DAG: SU(5): Data Latency=0 Reg=%8 +# CHECK-DAG: SU(3): Data Latency=0 Reg=%8 +# CHECK-DAG: SU(6): Anti Latency=1 +# CHECK-NEXT: SU(1): %10:intregs = PHI %2:intregs, %bb.1, %11:intregs, %bb.2 +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Successors: +# CHECK-DAG: SU(7): Data Latency=0 Reg=%10 +# CHECK-DAG: SU(4): Data Latency=0 Reg=%10 +# CHECK-DAG: SU(2): Data Latency=0 Reg=%10 +# CHECK-DAG: SU(7): Anti Latency=1 +# CHECK-NEXT: SU(2): %12:hvxvr = V6_vL32b_ai %10:intregs, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4) +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Predecessors: +# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10 +# CHECK-NEXT: Successors: +# CHECK-NEXT: SU(3): Data Latency=0 Reg=%12 +# CHECK-NEXT: SU(3): V6_vS32b_ai %8:intregs, 0, %12:hvxvr :: (store (s1024) into %ir.optr.010, !tbaa !4) +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Predecessors: +# CHECK-DAG: SU(2): Data Latency=0 Reg=%12 +# CHECK-DAG: SU(0): Data Latency=0 Reg=%8 +# CHECK-NEXT: SU(4): %13:hvxvr = V6_vL32b_ai %10:intregs, 128 :: (load (s1024) from %ir.cgep, !tbaa !4) +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Predecessors: +# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10 +# CHECK-NEXT: Successors: +# CHECK-NEXT: SU(5): Data Latency=0 Reg=%13 +# CHECK-NEXT: SU(5): V6_vS32b_ai %8:intregs, 128, %13:hvxvr :: (store (s1024) into %ir.cgep3, !tbaa !4) + + + + +--- | + define dso_local void @foo(ptr noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out, i32 noundef %width) local_unnamed_addr #0 { + entry: + %cmp7 = icmp sgt i32 %width, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + + for.body.preheader: ; preds = %entry + %0 = add i32 %width, 128 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %optr.010 = phi ptr [ %cgep4, %for.body ], [ %out, %for.body.preheader ] + %iptr.09 = phi ptr [ %cgep5, %for.body ], [ %in, %for.body.preheader ] + %ald = load <128 x i8>, ptr %iptr.09, align 128, !tbaa !4 + %cst = bitcast <128 x i8> %ald to <32 x i32> + store <32 x i32> %cst, ptr %optr.010, align 128, !tbaa !4 + %cgep = getelementptr i8, ptr %iptr.09, i32 128 + %ald1 = load <128 x i8>, ptr %cgep, align 128, !tbaa !4 + %cst2 = bitcast <128 x i8> %ald1 to <32 x i32> + %cgep3 = getelementptr i8, ptr %optr.010, i32 128 + store <32 x i32> %cst2, ptr %cgep3, align 128, !tbaa !4 + %lsr.iv.next = add i32 %lsr.iv, -128 + %cmp = icmp samesign ugt i32 %lsr.iv.next, 128 + %cgep4 = getelementptr i8, ptr %optr.010, i32 256 + %cgep5 = getelementptr i8, ptr %iptr.09, i32 256 + br i1 %cmp, label %for.body, label %for.end + + for.end: ; preds = %for.body, %entry + ret void + } + + attributes #0 = { "target-cpu"="hexagonv60" 
"target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" } + + !llvm.module.flags = !{!0, !1, !2, !3} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 8, !"PIC Level", i32 2} + !2 = !{i32 7, !"PIE Level", i32 2} + !3 = !{i32 7, !"frame-pointer", i32 2} + !4 = !{!5, !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} +... +--- +name: foo +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $r0, $r1, $r2 + + %9:intregs = COPY $r2 + %8:intregs = COPY $r1 + %7:intregs = COPY $r0 + %10:predregs = C2_cmpgti %9, 0 + J2_jumpf %10, %bb.3, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + %0:intregs = A2_addi %9, 128 + %15:intregs = A2_addi %0, -1 + %16:intregs = S2_lsr_i_r %15, 7 + %17:intregs = COPY %16 + J2_loop0r %bb.2, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.2.for.body (machine-block-address-taken): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + + %2:intregs = PHI %8, %bb.1, %5, %bb.2 + %3:intregs = PHI %7, %bb.1, %6, %bb.2 + %12:hvxvr = V6_vL32b_ai %3, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4) + V6_vS32b_ai %2, 0, killed %12 :: (store (s1024) into %ir.optr.010, !tbaa !4) + %13:hvxvr = V6_vL32b_ai %3, 128 :: (load (s1024) from %ir.cgep, !tbaa !4) + V6_vS32b_ai %2, 128, killed %13 :: (store (s1024) into %ir.cgep3, !tbaa !4) + %5:intregs = A2_addi %2, 256 + %6:intregs = A2_addi %3, 256 + ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.3, implicit-def dead $pc + + bb.3.for.end: + PS_jmpret $r31, implicit-def dead $pc +... From 0de2f64e652a1b8c1e051635c98fb2b69c6b2c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Wed, 23 Apr 2025 11:40:36 +0200 Subject: [PATCH 036/245] [clang] XFAIL the `Xclangas.s` test on AIX. (#136744) Clang on AIX does not use the integrated assembler. https://github.com/llvm/llvm-project/pull/100714#issuecomment-2822056054 --- clang/test/Driver/Xclangas.s | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Driver/Xclangas.s b/clang/test/Driver/Xclangas.s index 0d3911170eee1..fb6ae2ecbb189 100644 --- a/clang/test/Driver/Xclangas.s +++ b/clang/test/Driver/Xclangas.s @@ -2,3 +2,4 @@ // RUN: %clang -### -Werror -Xclangas -target-feature -Xclangas=+v5t %s 2>&1 | FileCheck %s // CHECK: -cc1as // CHECK-SAME: "-target-feature" "+v5t" +// XFAIL: target={{.*}}-aix{{.*}} From 11a3de7e98785b0df8f2010fb22c10c0590d2707 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Wed, 23 Apr 2025 12:43:05 +0300 Subject: [PATCH 037/245] [SDag][ARM][RISCV] Allow lowering CTPOP into a libcall (#101786) This is a reland of #99752 with the bug fixed (see test diff in the third commit in this PR). All `popcount` libcalls return `int`, but `ISD::CTPOP` returns the type of the argument, which can be wider than `int`. The fix is to make DAG legalizer pass the correct return type to `makeLibCall` and sign-extend the result afterwards. Original commit message: The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly. 
Pull Request: https://github.com/llvm/llvm-project/pull/101786 --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 +- llvm/include/llvm/IR/RuntimeLibcalls.def | 3 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 82 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 34 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 5 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +- llvm/test/CodeGen/ARM/popcnt.ll | 70 +- llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 1087 +++-------------- .../CodeGen/RISCV/ctz_zero_return_test.ll | 118 +- llvm/test/CodeGen/RISCV/pr56457.ll | 49 +- llvm/test/CodeGen/RISCV/pr95271.ll | 23 +- llvm/test/CodeGen/RISCV/rv32xtheadbb.ll | 104 +- llvm/test/CodeGen/RISCV/rv32zbb.ll | 330 ++--- llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 54 +- llvm/test/CodeGen/RISCV/rv64zbb.ll | 149 +-- llvm/test/CodeGen/RISCV/sextw-removal.ll | 46 +- llvm/test/CodeGen/Thumb2/mve-ctpop.ll | 63 +- 18 files changed, 547 insertions(+), 1682 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 80df6d7d956d3..edb7701402205 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2503,7 +2503,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return (LT.first * 2); else return (LT.first * 1); - } else if (!TLI->isOperationExpand(ISD, LT.second)) { + } else if (TLI->isOperationCustom(ISD, LT.second)) { // If the operation is custom lowered then assume // that the code is twice as expensive. return (LT.first * 2); diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index 2545aebc73391..cd8e9b598044c 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -85,6 +85,9 @@ HANDLE_LIBCALL(NEG_I64, "__negdi2") HANDLE_LIBCALL(CTLZ_I32, "__clzsi2") HANDLE_LIBCALL(CTLZ_I64, "__clzdi2") HANDLE_LIBCALL(CTLZ_I128, "__clzti2") +HANDLE_LIBCALL(CTPOP_I32, "__popcountsi2") +HANDLE_LIBCALL(CTPOP_I64, "__popcountdi2") +HANDLE_LIBCALL(CTPOP_I128, "__popcountti2") // Floating-point HANDLE_LIBCALL(ADD_F32, "__addsf3") diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b596f6b8d6ce2..3e47136edbefc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -129,7 +129,8 @@ class SelectionDAGLegalize { ArrayRef Mask) const; std::pair ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, - TargetLowering::ArgListTy &&Args, bool isSigned); + TargetLowering::ArgListTy &&Args, + bool IsSigned, EVT RetVT); std::pair ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall LC, @@ -150,6 +151,9 @@ class SelectionDAGLegalize { RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128, SmallVectorImpl &Results); + SDValue ExpandBitCountingLibCall(SDNode *Node, RTLIB::Libcall CallI32, + RTLIB::Libcall CallI64, + RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl &Results); void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl &Results); @@ -2114,9 +2118,10 @@ SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) { // register, return the lo part and set the hi part to the by-reg argument in // the first. If it does fit into a single register, return the result and // leave the Hi part unset. 
-std::pair SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, - TargetLowering::ArgListTy &&Args, - bool isSigned) { +std::pair +SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, + TargetLowering::ArgListTy &&Args, + bool IsSigned, EVT RetVT) { EVT CodePtrTy = TLI.getPointerTy(DAG.getDataLayout()); SDValue Callee; if (const char *LibcallName = TLI.getLibcallName(LC)) @@ -2127,7 +2132,6 @@ std::pair SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L Node->getOperationName(&DAG)); } - EVT RetVT = Node->getValueType(0); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); // By default, the input chain to this libcall is the entry node of the @@ -2147,7 +2151,7 @@ std::pair SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); - bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, isSigned); + bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, IsSigned); CLI.setDebugLoc(SDLoc(Node)) .setChain(InChain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, @@ -2183,7 +2187,8 @@ std::pair SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L Args.push_back(Entry); } - return ExpandLibCall(LC, Node, std::move(Args), isSigned); + return ExpandLibCall(LC, Node, std::move(Args), isSigned, + Node->getValueType(0)); } void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, @@ -2259,6 +2264,50 @@ void SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node, ExpandFPLibCall(Node, LC, Results); } +SDValue SelectionDAGLegalize::ExpandBitCountingLibCall( + SDNode *Node, RTLIB::Libcall CallI32, RTLIB::Libcall CallI64, + RTLIB::Libcall CallI128) { + RTLIB::Libcall LC; + switch (Node->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unexpected request for libcall!"); + case MVT::i32: + LC = CallI32; + break; + case MVT::i64: + LC = CallI64; + break; + case MVT::i128: + LC = CallI128; + break; + } + + // Bit-counting libcalls have one unsigned argument and return `int`. + // Note that `int` may be illegal on this target; ExpandLibCall will + // take care of promoting it to a legal type. + SDValue Op = Node->getOperand(0); + EVT IntVT = + EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize()); + + TargetLowering::ArgListEntry Arg; + EVT ArgVT = Op.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Arg.Node = Op; + Arg.Ty = ArgTy; + Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, /*IsSigned=*/false); + Arg.IsZExt = !Arg.IsSExt; + + SDValue Res = ExpandLibCall(LC, Node, TargetLowering::ArgListTy{Arg}, + /*IsSigned=*/true, IntVT) + .first; + + // If ExpandLibCall created a tail call, the result was already + // of the correct type. Otherwise, we need to sign extend it. + if (Res.getValueType() != MVT::Other) + Res = DAG.getSExtOrTrunc(Res, SDLoc(Node), Node->getValueType(0)); + return Res; +} + /// Issue libcalls to __{u}divmod to compute div / rem pairs. 
void SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, @@ -4993,19 +5042,12 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::MUL_I64, RTLIB::MUL_I128)); break; case ISD::CTLZ_ZERO_UNDEF: - switch (Node->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("LibCall explicitly requested, but not available"); - case MVT::i32: - Results.push_back(ExpandLibCall(RTLIB::CTLZ_I32, Node, false).first); - break; - case MVT::i64: - Results.push_back(ExpandLibCall(RTLIB::CTLZ_I64, Node, false).first); - break; - case MVT::i128: - Results.push_back(ExpandLibCall(RTLIB::CTLZ_I128, Node, false).first); - break; - } + Results.push_back(ExpandBitCountingLibCall( + Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64, RTLIB::CTLZ_I128)); + break; + case ISD::CTPOP: + Results.push_back(ExpandBitCountingLibCall( + Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64, RTLIB::CTPOP_I128)); break; case ISD::RESET_FPENV: { // It is legalized to call 'fesetenv(FE_DFL_ENV)'. On most targets diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 53244a990a864..83dd519fb2ea4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -4012,15 +4012,35 @@ void DAGTypeLegalizer::ExpandIntRes_ABD(SDNode *N, SDValue &Lo, SDValue &Hi) { SplitInteger(Result, Lo, Hi); } -void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, - SDValue &Lo, SDValue &Hi) { - SDLoc dl(N); +void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) { + SDValue Op = N->getOperand(0); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (TLI.getOperationAction(ISD::CTPOP, VT) == TargetLoweringBase::LibCall) { + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (VT == MVT::i32) + LC = RTLIB::CTPOP_I32; + else if (VT == MVT::i64) + LC = RTLIB::CTPOP_I64; + else if (VT == MVT::i128) + LC = RTLIB::CTPOP_I128; + assert(LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC) && + "LibCall explicitly requested, but not available"); + TargetLowering::MakeLibCallOptions CallOptions; + EVT IntVT = + EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize()); + SDValue Res = TLI.makeLibCall(DAG, LC, IntVT, Op, CallOptions, DL).first; + SplitInteger(DAG.getSExtOrTrunc(Res, DL, VT), Lo, Hi); + return; + } + // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo) - GetExpandedInteger(N->getOperand(0), Lo, Hi); + GetExpandedInteger(Op, Lo, Hi); EVT NVT = Lo.getValueType(); - Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo), - DAG.getNode(ISD::CTPOP, dl, NVT, Hi)); - Hi = DAG.getConstant(0, dl, NVT); + Lo = DAG.getNode(ISD::ADD, DL, NVT, DAG.getNode(ISD::CTPOP, DL, NVT, Lo), + DAG.getNode(ISD::CTPOP, DL, NVT, Hi)); + Hi = DAG.getConstant(0, DL, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N, diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 3995216e3d689..3362677b361ec 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9390,8 +9390,9 @@ SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const { !isOperationLegalOrCustomOrPromote(ISD::XOR, VT))) return SDValue(); - // Emit Table Lookup if ISD::CTLZ and ISD::CTPOP are not legal. 
- if (!VT.isVector() && isOperationExpand(ISD::CTPOP, VT) && + // Emit Table Lookup if ISD::CTPOP used in the fallback path below is going + // to be expanded or converted to a libcall. + if (!VT.isVector() && !isOperationLegalOrCustomOrPromote(ISD::CTPOP, VT) && !isOperationLegal(ISD::CTLZ, VT)) if (SDValue V = CTTZTableLookup(Node, DAG, dl, VT, Op, NumBitsPerElt)) return V; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2290ac2728c6d..bdebd842b011c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1221,7 +1221,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTR, VT, Expand); } setOperationAction(ISD::CTTZ, MVT::i32, Custom); - setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, LibCall); + setOperationAction(ISD::CTPOP, MVT::i64, LibCall); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { setOperationAction(ISD::CTLZ, MVT::i32, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dadae2e71d44c..a75bd54ef5435 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -395,7 +395,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.is64Bit()) setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); } else { - setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand); + setOperationAction(ISD::CTTZ, XLenVT, Expand); + if (Subtarget.is64Bit()) + setOperationAction(ISD::CTPOP, MVT::i128, LibCall); + else + setOperationAction(ISD::CTPOP, MVT::i32, LibCall); + setOperationAction(ISD::CTPOP, MVT::i64, LibCall); } if (Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() || diff --git a/llvm/test/CodeGen/ARM/popcnt.ll b/llvm/test/CodeGen/ARM/popcnt.ll index edcae5e141e73..fc4387320ef77 100644 --- a/llvm/test/CodeGen/ARM/popcnt.ll +++ b/llvm/test/CodeGen/ARM/popcnt.ll @@ -324,76 +324,22 @@ define i32 @ctpop16(i16 %x) nounwind readnone { define i32 @ctpop32(i32 %x) nounwind readnone { ; CHECK-LABEL: ctpop32: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldr r1, .LCPI22_0 -; CHECK-NEXT: ldr r2, .LCPI22_3 -; CHECK-NEXT: and r1, r1, r0, lsr #1 -; CHECK-NEXT: ldr r12, .LCPI22_1 -; CHECK-NEXT: sub r0, r0, r1 -; CHECK-NEXT: ldr r3, .LCPI22_2 -; CHECK-NEXT: and r1, r0, r2 -; CHECK-NEXT: and r0, r2, r0, lsr #2 -; CHECK-NEXT: add r0, r1, r0 -; CHECK-NEXT: add r0, r0, r0, lsr #4 -; CHECK-NEXT: and r0, r0, r12 -; CHECK-NEXT: mul r1, r0, r3 -; CHECK-NEXT: lsr r0, r1, #24 -; CHECK-NEXT: mov pc, lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI22_0: -; CHECK-NEXT: .long 1431655765 @ 0x55555555 -; CHECK-NEXT: .LCPI22_1: -; CHECK-NEXT: .long 252645135 @ 0xf0f0f0f -; CHECK-NEXT: .LCPI22_2: -; CHECK-NEXT: .long 16843009 @ 0x1010101 -; CHECK-NEXT: .LCPI22_3: -; CHECK-NEXT: .long 858993459 @ 0x33333333 +; CHECK-NEXT: b __popcountsi2 %count = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %count } -define i32 @ctpop64(i64 %x) nounwind readnone { +define i64 @ctpop64(i64 %x) nounwind readnone { ; CHECK-LABEL: ctpop64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldr r2, .LCPI23_0 -; CHECK-NEXT: ldr r3, .LCPI23_3 -; CHECK-NEXT: and r4, r2, r0, lsr #1 -; CHECK-NEXT: and r2, r2, r1, lsr #1 -; CHECK-NEXT: sub r0, r0, r4 -; CHECK-NEXT: sub r1, 
r1, r2 -; CHECK-NEXT: and r4, r0, r3 -; CHECK-NEXT: and r2, r1, r3 -; CHECK-NEXT: and r0, r3, r0, lsr #2 -; CHECK-NEXT: and r1, r3, r1, lsr #2 -; CHECK-NEXT: add r0, r4, r0 -; CHECK-NEXT: ldr lr, .LCPI23_1 -; CHECK-NEXT: add r1, r2, r1 -; CHECK-NEXT: ldr r12, .LCPI23_2 -; CHECK-NEXT: add r0, r0, r0, lsr #4 -; CHECK-NEXT: and r0, r0, lr -; CHECK-NEXT: add r1, r1, r1, lsr #4 -; CHECK-NEXT: mul r2, r0, r12 -; CHECK-NEXT: and r0, r1, lr -; CHECK-NEXT: mul r1, r0, r12 -; CHECK-NEXT: lsr r0, r2, #24 -; CHECK-NEXT: add r0, r0, r1, lsr #24 -; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl __popcountdi2 +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI23_0: -; CHECK-NEXT: .long 1431655765 @ 0x55555555 -; CHECK-NEXT: .LCPI23_1: -; CHECK-NEXT: .long 252645135 @ 0xf0f0f0f -; CHECK-NEXT: .LCPI23_2: -; CHECK-NEXT: .long 16843009 @ 0x1010101 -; CHECK-NEXT: .LCPI23_3: -; CHECK-NEXT: .long 858993459 @ 0x33333333 %count = tail call i64 @llvm.ctpop.i64(i64 %x) - %conv = trunc i64 %count to i32 - ret i32 %conv + ret i64 %count } define i32 @ctpop_eq_one(i64 %x) nounwind readnone { diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index a46168f114bb9..f8c3a75f844db 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -1156,46 +1156,30 @@ define i16 @test_ctlz_i16(i16 %a) nounwind { } define i32 @test_ctlz_i32(i32 %a) nounwind { -; RV32I-LABEL: test_ctlz_i32: -; RV32I: # %bb.0: -; RV32I-NEXT: beqz a0, .LBB10_2 -; RV32I-NEXT: # %bb.1: # %cond.false -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: addi a1, a2, 1365 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 16 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB10_2: -; RV32I-NEXT: li a0, 32 -; RV32I-NEXT: ret +; RV32_NOZBB-LABEL: test_ctlz_i32: +; RV32_NOZBB: # %bb.0: +; RV32_NOZBB-NEXT: beqz a0, .LBB10_2 +; RV32_NOZBB-NEXT: # %bb.1: # %cond.false +; RV32_NOZBB-NEXT: addi sp, sp, -16 +; RV32_NOZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: srli a1, a0, 1 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 2 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 4 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 16 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: not a0, a0 +; RV32_NOZBB-NEXT: call __popcountsi2 +; RV32_NOZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: addi sp, sp, 16 +; RV32_NOZBB-NEXT: ret +; RV32_NOZBB-NEXT: 
.LBB10_2: +; RV32_NOZBB-NEXT: li a0, 32 +; RV32_NOZBB-NEXT: ret ; ; RV64I-LABEL: test_ctlz_i32: ; RV64I: # %bb.0: @@ -1239,46 +1223,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64I-NEXT: li a0, 32 ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_ctlz_i32: -; RV32M: # %bb.0: -; RV32M-NEXT: beqz a0, .LBB10_2 -; RV32M-NEXT: # %bb.1: # %cond.false -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: addi a1, a2, 1365 -; RV32M-NEXT: srli a2, a0, 2 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: srli a2, a0, 4 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: srli a2, a0, 8 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: srli a2, a0, 16 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a2, a0, 1 -; RV32M-NEXT: and a1, a2, a1 -; RV32M-NEXT: lui a2, 209715 -; RV32M-NEXT: addi a2, a2, 819 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a2 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: lui a2, 61681 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: lui a1, 4112 -; RV32M-NEXT: addi a2, a2, -241 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: addi a1, a1, 257 -; RV32M-NEXT: mul a0, a0, a1 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: ret -; RV32M-NEXT: .LBB10_2: -; RV32M-NEXT: li a0, 32 -; RV32M-NEXT: ret -; ; RV64M-LABEL: test_ctlz_i32: ; RV64M: # %bb.0: ; RV64M-NEXT: sext.w a1, a0 @@ -1346,240 +1290,75 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { } define i64 @test_ctlz_i64(i64 %a) nounwind { -; RV32I-LABEL: test_ctlz_i64: -; RV32I: # %bb.0: -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: addi a2, a5, -241 -; RV32I-NEXT: bnez a1, .LBB11_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; -; 
RV64I-LABEL: test_ctlz_i64: -; RV64I: # %bb.0: -; RV64I-NEXT: beqz a0, .LBB11_2 -; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: lui a3, 209715 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: addiw a1, a2, 1365 -; RV64I-NEXT: addiw a2, a3, 819 -; RV64I-NEXT: srli a3, a0, 2 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 4 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB11_2: -; RV64I-NEXT: li a0, 64 -; RV64I-NEXT: ret -; -; RV32M-LABEL: test_ctlz_i64: -; RV32M: # %bb.0: -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: lui a3, 209715 -; RV32M-NEXT: lui a6, 61681 -; RV32M-NEXT: lui a7, 4112 -; RV32M-NEXT: addi a5, a2, 1365 -; RV32M-NEXT: addi a4, a3, 819 -; RV32M-NEXT: addi a3, a6, -241 -; RV32M-NEXT: addi a2, a7, 257 -; RV32M-NEXT: bnez a1, .LBB11_2 -; RV32M-NEXT: # %bb.1: -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 2 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 8 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 16 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: and a1, a1, a5 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a4 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a4 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a3 -; RV32M-NEXT: mul a0, a0, a2 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: addi a0, a0, 32 -; RV32M-NEXT: li a1, 0 -; RV32M-NEXT: ret -; RV32M-NEXT: .LBB11_2: -; RV32M-NEXT: srli a0, a1, 1 -; RV32M-NEXT: or a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 2 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 8 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 16 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: and a1, a1, a5 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a4 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a4 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a3 -; RV32M-NEXT: mul a0, a0, a2 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: li a1, 0 -; RV32M-NEXT: ret +; RV32_NOZBB-LABEL: test_ctlz_i64: +; RV32_NOZBB: # %bb.0: +; RV32_NOZBB-NEXT: addi sp, sp, -16 +; RV32_NOZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; 
RV32_NOZBB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: mv s1, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 1 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 2 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 4 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 16 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: not a0, a0 +; RV32_NOZBB-NEXT: call __popcountsi2 +; RV32_NOZBB-NEXT: mv s0, a0 +; RV32_NOZBB-NEXT: srli a0, s1, 1 +; RV32_NOZBB-NEXT: or a0, s1, a0 +; RV32_NOZBB-NEXT: srli a1, a0, 2 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 4 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 16 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: not a0, a0 +; RV32_NOZBB-NEXT: call __popcountsi2 +; RV32_NOZBB-NEXT: bnez s1, .LBB11_2 +; RV32_NOZBB-NEXT: # %bb.1: +; RV32_NOZBB-NEXT: addi a0, s0, 32 +; RV32_NOZBB-NEXT: .LBB11_2: +; RV32_NOZBB-NEXT: li a1, 0 +; RV32_NOZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: addi sp, sp, 16 +; RV32_NOZBB-NEXT: ret ; -; RV64M-LABEL: test_ctlz_i64: -; RV64M: # %bb.0: -; RV64M-NEXT: beqz a0, .LBB11_2 -; RV64M-NEXT: # %bb.1: # %cond.false -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: lui a3, 209715 -; RV64M-NEXT: lui a4, 61681 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: addiw a1, a2, 1365 -; RV64M-NEXT: addiw a2, a3, 819 -; RV64M-NEXT: addiw a3, a4, -241 -; RV64M-NEXT: srli a4, a0, 2 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: slli a4, a1, 32 -; RV64M-NEXT: add a1, a1, a4 -; RV64M-NEXT: slli a4, a2, 32 -; RV64M-NEXT: add a2, a2, a4 -; RV64M-NEXT: slli a4, a3, 32 -; RV64M-NEXT: add a3, a3, a4 -; RV64M-NEXT: srli a4, a0, 4 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: srli a4, a0, 8 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: srli a4, a0, 16 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: srli a4, a0, 32 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: not a0, a0 -; RV64M-NEXT: srli a4, a0, 1 -; RV64M-NEXT: and a1, a4, a1 -; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: and a1, a0, a2 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a2 -; RV64M-NEXT: lui a2, 4112 -; RV64M-NEXT: addiw a2, a2, 257 -; RV64M-NEXT: add a0, a1, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: slli a1, a2, 32 -; RV64M-NEXT: and a0, a0, a3 -; RV64M-NEXT: add a1, a2, a1 -; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srli a0, a0, 56 -; RV64M-NEXT: ret -; RV64M-NEXT: .LBB11_2: -; RV64M-NEXT: li a0, 64 -; RV64M-NEXT: ret +; RV64NOZBB-LABEL: test_ctlz_i64: +; RV64NOZBB: # %bb.0: +; RV64NOZBB-NEXT: beqz a0, .LBB11_2 +; RV64NOZBB-NEXT: # %bb.1: # %cond.false +; RV64NOZBB-NEXT: addi sp, sp, -16 +; RV64NOZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64NOZBB-NEXT: srli a1, a0, 1 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 2 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 4 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 8 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 16 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 32 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: not a0, a0 +; RV64NOZBB-NEXT: call __popcountdi2 +; RV64NOZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; 
RV64NOZBB-NEXT: addi sp, sp, 16 +; RV64NOZBB-NEXT: ret +; RV64NOZBB-NEXT: .LBB11_2: +; RV64NOZBB-NEXT: li a0, 64 +; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i64: ; RV32ZBB: # %bb.0: @@ -1793,41 +1572,20 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { } define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { -; RV32I-LABEL: test_ctlz_i32_zero_undef: -; RV32I: # %bb.0: -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: addi a1, a2, 1365 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 16 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: ret +; RV32_NOZBB-LABEL: test_ctlz_i32_zero_undef: +; RV32_NOZBB: # %bb.0: +; RV32_NOZBB-NEXT: srli a1, a0, 1 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 2 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 4 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 16 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: not a0, a0 +; RV32_NOZBB-NEXT: tail __popcountsi2 ; ; RV64I-LABEL: test_ctlz_i32_zero_undef: ; RV64I: # %bb.0: @@ -1865,41 +1623,6 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_ctlz_i32_zero_undef: -; RV32M: # %bb.0: -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: addi a1, a2, 1365 -; RV32M-NEXT: srli a2, a0, 2 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: srli a2, a0, 4 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: srli a2, a0, 8 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: srli a2, a0, 16 -; RV32M-NEXT: or a0, a0, a2 -; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a2, a0, 1 -; RV32M-NEXT: and a1, a2, a1 -; RV32M-NEXT: lui a2, 209715 -; RV32M-NEXT: addi a2, a2, 819 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a2 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: lui a2, 61681 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: lui a1, 4112 -; RV32M-NEXT: addi a2, a2, -241 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: addi a1, a1, 257 -; RV32M-NEXT: mul a0, a0, a1 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: ret -; ; RV64M-LABEL: test_ctlz_i32_zero_undef: ; RV64M: # %bb.0: ; RV64M-NEXT: srliw a1, a0, 1 @@ -1961,230 +1684,70 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { } define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { -; RV32I-LABEL: test_ctlz_i64_zero_undef: -; RV32I: # %bb.0: -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: addi a2, a5, -241 -; RV32I-NEXT: 
bnez a1, .LBB15_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB15_2: -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_ctlz_i64_zero_undef: -; RV64I: # %bb.0: -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: lui a3, 209715 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: addiw a1, a2, 1365 -; RV64I-NEXT: addiw a2, a3, 819 -; RV64I-NEXT: srli a3, a0, 2 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 4 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ret -; -; RV32M-LABEL: test_ctlz_i64_zero_undef: -; RV32M: # %bb.0: -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: lui a3, 209715 -; RV32M-NEXT: lui a6, 61681 -; RV32M-NEXT: lui a7, 4112 -; RV32M-NEXT: addi a5, a2, 1365 -; RV32M-NEXT: addi a4, a3, 819 -; RV32M-NEXT: addi a3, a6, -241 -; RV32M-NEXT: addi a2, a7, 257 -; RV32M-NEXT: bnez a1, .LBB15_2 -; RV32M-NEXT: # %bb.1: -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 2 -; RV32M-NEXT: 
or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 8 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 16 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: and a1, a1, a5 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a4 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a4 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a3 -; RV32M-NEXT: mul a0, a0, a2 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: addi a0, a0, 32 -; RV32M-NEXT: li a1, 0 -; RV32M-NEXT: ret -; RV32M-NEXT: .LBB15_2: -; RV32M-NEXT: srli a0, a1, 1 -; RV32M-NEXT: or a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 2 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 8 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 16 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: and a1, a1, a5 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a4 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a4 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a3 -; RV32M-NEXT: mul a0, a0, a2 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: li a1, 0 -; RV32M-NEXT: ret +; RV32_NOZBB-LABEL: test_ctlz_i64_zero_undef: +; RV32_NOZBB: # %bb.0: +; RV32_NOZBB-NEXT: addi sp, sp, -16 +; RV32_NOZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: mv s1, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 1 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 2 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 4 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 16 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: not a0, a0 +; RV32_NOZBB-NEXT: call __popcountsi2 +; RV32_NOZBB-NEXT: mv s0, a0 +; RV32_NOZBB-NEXT: srli a0, s1, 1 +; RV32_NOZBB-NEXT: or a0, s1, a0 +; RV32_NOZBB-NEXT: srli a1, a0, 2 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 4 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 16 +; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: not a0, a0 +; RV32_NOZBB-NEXT: call __popcountsi2 +; RV32_NOZBB-NEXT: bnez s1, .LBB15_2 +; RV32_NOZBB-NEXT: # %bb.1: +; RV32_NOZBB-NEXT: addi a0, s0, 32 +; RV32_NOZBB-NEXT: .LBB15_2: +; RV32_NOZBB-NEXT: li a1, 0 +; RV32_NOZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: addi sp, sp, 16 +; RV32_NOZBB-NEXT: ret ; -; RV64M-LABEL: test_ctlz_i64_zero_undef: -; RV64M: # %bb.0: -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: lui a3, 209715 -; RV64M-NEXT: lui a4, 61681 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: addiw a1, a2, 1365 -; RV64M-NEXT: addiw a2, a3, 819 -; RV64M-NEXT: addiw a3, a4, -241 -; RV64M-NEXT: srli a4, a0, 2 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: slli a4, a1, 32 -; RV64M-NEXT: add a1, a1, a4 -; RV64M-NEXT: slli a4, a2, 32 -; RV64M-NEXT: add a2, a2, a4 -; RV64M-NEXT: slli a4, a3, 32 -; RV64M-NEXT: add a3, a3, a4 -; RV64M-NEXT: srli a4, a0, 4 -; 
RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: srli a4, a0, 8 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: srli a4, a0, 16 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: srli a4, a0, 32 -; RV64M-NEXT: or a0, a0, a4 -; RV64M-NEXT: not a0, a0 -; RV64M-NEXT: srli a4, a0, 1 -; RV64M-NEXT: and a1, a4, a1 -; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: and a1, a0, a2 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a2 -; RV64M-NEXT: lui a2, 4112 -; RV64M-NEXT: addiw a2, a2, 257 -; RV64M-NEXT: add a0, a1, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: slli a1, a2, 32 -; RV64M-NEXT: and a0, a0, a3 -; RV64M-NEXT: add a1, a2, a1 -; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srli a0, a0, 56 -; RV64M-NEXT: ret +; RV64NOZBB-LABEL: test_ctlz_i64_zero_undef: +; RV64NOZBB: # %bb.0: +; RV64NOZBB-NEXT: addi sp, sp, -16 +; RV64NOZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64NOZBB-NEXT: srli a1, a0, 1 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 2 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 4 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 8 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 16 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 32 +; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: not a0, a0 +; RV64NOZBB-NEXT: call __popcountdi2 +; RV64NOZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64NOZBB-NEXT: addi sp, sp, 16 +; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i64_zero_undef: ; RV32ZBB: # %bb.0: @@ -2396,30 +1959,9 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { } define i32 @test_ctpop_i32(i32 %a) nounwind { -; RV32I-LABEL: test_ctpop_i32: -; RV32I: # %bb.0: -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: ret +; RV32_NOZBB-LABEL: test_ctpop_i32: +; RV32_NOZBB: # %bb.0: +; RV32_NOZBB-NEXT: tail __popcountsi2 ; ; RV64I-LABEL: test_ctpop_i32: ; RV64I: # %bb.0: @@ -2446,30 +1988,6 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_ctpop_i32: -; RV32M: # %bb.0: -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: addi a2, a2, 1365 -; RV32M-NEXT: and a1, a1, a2 -; RV32M-NEXT: lui a2, 209715 -; RV32M-NEXT: addi a2, a2, 819 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: and a1, a0, a2 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: lui a2, 61681 -; RV32M-NEXT: add a0, a1, a0 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: lui a1, 4112 -; RV32M-NEXT: addi a2, a2, -241 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: addi a1, a1, 257 -; RV32M-NEXT: mul a0, a0, a1 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: ret -; ; RV64M-LABEL: test_ctpop_i32: ; RV64M: # %bb.0: ; RV64M-NEXT: srli a1, a0, 1 @@ -2506,28 +2024,7 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; ; RV32XTHEADBB-LABEL: test_ctpop_i32: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: 
srli a1, a0, 1 -; RV32XTHEADBB-NEXT: lui a2, 349525 -; RV32XTHEADBB-NEXT: addi a2, a2, 1365 -; RV32XTHEADBB-NEXT: and a1, a1, a2 -; RV32XTHEADBB-NEXT: lui a2, 209715 -; RV32XTHEADBB-NEXT: addi a2, a2, 819 -; RV32XTHEADBB-NEXT: sub a0, a0, a1 -; RV32XTHEADBB-NEXT: and a1, a0, a2 -; RV32XTHEADBB-NEXT: srli a0, a0, 2 -; RV32XTHEADBB-NEXT: and a0, a0, a2 -; RV32XTHEADBB-NEXT: lui a2, 61681 -; RV32XTHEADBB-NEXT: add a0, a1, a0 -; RV32XTHEADBB-NEXT: srli a1, a0, 4 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: addi a1, a2, -241 -; RV32XTHEADBB-NEXT: and a0, a0, a1 -; RV32XTHEADBB-NEXT: slli a1, a0, 8 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: slli a1, a0, 16 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: srli a0, a0, 24 -; RV32XTHEADBB-NEXT: ret +; RV32XTHEADBB-NEXT: tail __popcountsi2 ; ; RV64XTHEADBB-LABEL: test_ctpop_i32: ; RV64XTHEADBB: # %bb.0: @@ -2558,150 +2055,24 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { } define i64 @test_ctpop_i64(i64 %a) nounwind { -; RV32I-LABEL: test_ctpop_i64: -; RV32I: # %bb.0: -; RV32I-NEXT: srli a2, a1, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: srli a5, a0, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a3 -; RV32I-NEXT: and a2, a1, a4 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a3, a0, a4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: srli a2, a1, 4 -; RV32I-NEXT: srli a3, a0, 4 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: slli a3, a0, 16 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_ctpop_i64: -; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ret -; -; RV32M-LABEL: test_ctpop_i64: -; RV32M: # %bb.0: -; RV32M-NEXT: srli a2, a1, 1 -; RV32M-NEXT: lui a3, 349525 -; RV32M-NEXT: lui a4, 209715 -; RV32M-NEXT: lui a5, 61681 -; RV32M-NEXT: srli a6, a0, 1 -; RV32M-NEXT: addi a3, a3, 1365 -; RV32M-NEXT: and a2, a2, a3 -; RV32M-NEXT: and a3, a6, a3 -; RV32M-NEXT: lui a6, 4112 -; RV32M-NEXT: addi a4, a4, 819 -; 
RV32M-NEXT: addi a5, a5, -241 -; RV32M-NEXT: addi a6, a6, 257 -; RV32M-NEXT: sub a1, a1, a2 -; RV32M-NEXT: sub a0, a0, a3 -; RV32M-NEXT: and a2, a1, a4 -; RV32M-NEXT: srli a1, a1, 2 -; RV32M-NEXT: and a3, a0, a4 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a1, a1, a4 -; RV32M-NEXT: and a0, a0, a4 -; RV32M-NEXT: add a1, a2, a1 -; RV32M-NEXT: add a0, a3, a0 -; RV32M-NEXT: srli a2, a1, 4 -; RV32M-NEXT: srli a3, a0, 4 -; RV32M-NEXT: add a1, a1, a2 -; RV32M-NEXT: add a0, a0, a3 -; RV32M-NEXT: and a1, a1, a5 -; RV32M-NEXT: and a0, a0, a5 -; RV32M-NEXT: mul a1, a1, a6 -; RV32M-NEXT: mul a0, a0, a6 -; RV32M-NEXT: srli a1, a1, 24 -; RV32M-NEXT: srli a0, a0, 24 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: li a1, 0 -; RV32M-NEXT: ret +; RV32_NOZBB-LABEL: test_ctpop_i64: +; RV32_NOZBB: # %bb.0: +; RV32_NOZBB-NEXT: addi sp, sp, -16 +; RV32_NOZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32_NOZBB-NEXT: call __popcountdi2 +; RV32_NOZBB-NEXT: srai a1, a0, 31 +; RV32_NOZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32_NOZBB-NEXT: addi sp, sp, 16 +; RV32_NOZBB-NEXT: ret ; -; RV64M-LABEL: test_ctpop_i64: -; RV64M: # %bb.0: -; RV64M-NEXT: lui a1, 349525 -; RV64M-NEXT: lui a2, 209715 -; RV64M-NEXT: lui a3, 61681 -; RV64M-NEXT: addiw a1, a1, 1365 -; RV64M-NEXT: addiw a2, a2, 819 -; RV64M-NEXT: addiw a3, a3, -241 -; RV64M-NEXT: slli a4, a1, 32 -; RV64M-NEXT: add a1, a1, a4 -; RV64M-NEXT: slli a4, a2, 32 -; RV64M-NEXT: add a2, a2, a4 -; RV64M-NEXT: slli a4, a3, 32 -; RV64M-NEXT: add a3, a3, a4 -; RV64M-NEXT: srli a4, a0, 1 -; RV64M-NEXT: and a1, a4, a1 -; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: and a1, a0, a2 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a2 -; RV64M-NEXT: lui a2, 4112 -; RV64M-NEXT: addiw a2, a2, 257 -; RV64M-NEXT: add a0, a1, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: slli a1, a2, 32 -; RV64M-NEXT: and a0, a0, a3 -; RV64M-NEXT: add a1, a2, a1 -; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srli a0, a0, 56 -; RV64M-NEXT: ret +; RV64NOZBB-LABEL: test_ctpop_i64: +; RV64NOZBB: # %bb.0: +; RV64NOZBB-NEXT: addi sp, sp, -16 +; RV64NOZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64NOZBB-NEXT: call __popcountdi2 +; RV64NOZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64NOZBB-NEXT: addi sp, sp, 16 +; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctpop_i64: ; RV32ZBB: # %bb.0: @@ -2718,77 +2089,21 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; RV32XTHEADBB-LABEL: test_ctpop_i64: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: srli a2, a1, 1 -; RV32XTHEADBB-NEXT: lui a3, 349525 -; RV32XTHEADBB-NEXT: lui a4, 209715 -; RV32XTHEADBB-NEXT: srli a5, a0, 1 -; RV32XTHEADBB-NEXT: addi a3, a3, 1365 -; RV32XTHEADBB-NEXT: and a2, a2, a3 -; RV32XTHEADBB-NEXT: and a3, a5, a3 -; RV32XTHEADBB-NEXT: lui a5, 61681 -; RV32XTHEADBB-NEXT: addi a4, a4, 819 -; RV32XTHEADBB-NEXT: addi a5, a5, -241 -; RV32XTHEADBB-NEXT: sub a1, a1, a2 -; RV32XTHEADBB-NEXT: sub a0, a0, a3 -; RV32XTHEADBB-NEXT: and a2, a1, a4 -; RV32XTHEADBB-NEXT: srli a1, a1, 2 -; RV32XTHEADBB-NEXT: and a3, a0, a4 -; RV32XTHEADBB-NEXT: srli a0, a0, 2 -; RV32XTHEADBB-NEXT: and a1, a1, a4 -; RV32XTHEADBB-NEXT: and a0, a0, a4 -; RV32XTHEADBB-NEXT: add a1, a2, a1 -; RV32XTHEADBB-NEXT: add a0, a3, a0 -; RV32XTHEADBB-NEXT: srli a2, a1, 4 -; RV32XTHEADBB-NEXT: srli a3, a0, 4 -; RV32XTHEADBB-NEXT: add a1, a1, a2 -; RV32XTHEADBB-NEXT: add a0, a0, a3 -; RV32XTHEADBB-NEXT: and a1, a1, a5 -; RV32XTHEADBB-NEXT: and a0, a0, a5 -; RV32XTHEADBB-NEXT: slli a2, a1, 8 -; RV32XTHEADBB-NEXT: slli a3, a0, 8 -; 
RV32XTHEADBB-NEXT: add a1, a1, a2 -; RV32XTHEADBB-NEXT: add a0, a0, a3 -; RV32XTHEADBB-NEXT: slli a2, a1, 16 -; RV32XTHEADBB-NEXT: slli a3, a0, 16 -; RV32XTHEADBB-NEXT: add a1, a1, a2 -; RV32XTHEADBB-NEXT: add a0, a0, a3 -; RV32XTHEADBB-NEXT: srli a1, a1, 24 -; RV32XTHEADBB-NEXT: srli a0, a0, 24 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: addi sp, sp, -16 +; RV32XTHEADBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32XTHEADBB-NEXT: call __popcountdi2 +; RV32XTHEADBB-NEXT: srai a1, a0, 31 +; RV32XTHEADBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32XTHEADBB-NEXT: addi sp, sp, 16 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: test_ctpop_i64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: lui a1, 349525 -; RV64XTHEADBB-NEXT: lui a2, 209715 -; RV64XTHEADBB-NEXT: addiw a1, a1, 1365 -; RV64XTHEADBB-NEXT: addiw a2, a2, 819 -; RV64XTHEADBB-NEXT: slli a3, a1, 32 -; RV64XTHEADBB-NEXT: add a1, a1, a3 -; RV64XTHEADBB-NEXT: slli a3, a2, 32 -; RV64XTHEADBB-NEXT: add a2, a2, a3 -; RV64XTHEADBB-NEXT: srli a3, a0, 1 -; RV64XTHEADBB-NEXT: and a1, a3, a1 -; RV64XTHEADBB-NEXT: lui a3, 61681 -; RV64XTHEADBB-NEXT: addiw a3, a3, -241 -; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: and a1, a0, a2 -; RV64XTHEADBB-NEXT: srli a0, a0, 2 -; RV64XTHEADBB-NEXT: and a0, a0, a2 -; RV64XTHEADBB-NEXT: slli a2, a3, 32 -; RV64XTHEADBB-NEXT: add a0, a1, a0 -; RV64XTHEADBB-NEXT: srli a1, a0, 4 -; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: add a2, a3, a2 -; RV64XTHEADBB-NEXT: and a0, a0, a2 -; RV64XTHEADBB-NEXT: slli a1, a0, 8 -; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: slli a1, a0, 16 -; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: slli a1, a0, 32 -; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: srli a0, a0, 56 +; RV64XTHEADBB-NEXT: addi sp, sp, -16 +; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64XTHEADBB-NEXT: call __popcountdi2 +; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64XTHEADBB-NEXT: addi sp, sp, 16 ; RV64XTHEADBB-NEXT: ret %1 = call i64 @llvm.ctpop.i64(i64 %a) ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index 03a6a6b1c4b7d..d57c4d653b2ae 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -602,14 +602,11 @@ define signext i32 @ctlz(i64 %b) nounwind { ; ; RV32I-LABEL: ctlz: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: addi a2, a5, -241 -; RV32I-NEXT: bnez a1, .LBB7_2 -; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -621,27 +618,10 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, 
a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: andi a0, a0, 63 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srli a0, s1, 1 +; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -651,69 +631,39 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: bnez s1, .LBB7_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: addi a0, s0, 32 +; RV32I-NEXT: .LBB7_2: # %entry ; RV32I-NEXT: andi a0, a0, 63 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ctlz: ; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: lui a3, 209715 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: addiw a1, a2, 1365 -; RV64I-NEXT: addiw a2, a3, 819 -; RV64I-NEXT: srli a3, a0, 2 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 4 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a0, a0, 2 -; RV64I-NEXT: srli a0, a0, 58 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: call __popcountdi2 +; RV64I-NEXT: andi a0, a0, 63 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll index cf518b31a190b..19cc8b3af208f 100644 --- a/llvm/test/CodeGen/RISCV/pr56457.ll +++ b/llvm/test/CodeGen/RISCV/pr56457.ll 
@@ -9,46 +9,25 @@ define i15 @foo(i15 %x) nounwind { ; CHECK-NEXT: slli a1, a0, 49 ; CHECK-NEXT: beqz a1, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %cond.false +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; CHECK-NEXT: srli a1, a1, 50 -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: lui a3, 209715 -; CHECK-NEXT: lui a4, 61681 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: addiw a1, a2, 1365 -; CHECK-NEXT: addiw a2, a3, 819 -; CHECK-NEXT: addiw a3, a4, -241 -; CHECK-NEXT: slli a4, a2, 32 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: slli a4, a3, 32 -; CHECK-NEXT: add a3, a3, a4 -; CHECK-NEXT: slli a4, a0, 49 -; CHECK-NEXT: srli a4, a4, 51 -; CHECK-NEXT: or a0, a0, a4 -; CHECK-NEXT: slli a4, a0, 49 -; CHECK-NEXT: srli a4, a4, 53 -; CHECK-NEXT: or a0, a0, a4 -; CHECK-NEXT: slli a4, a0, 49 -; CHECK-NEXT: srli a4, a4, 57 -; CHECK-NEXT: or a0, a0, a4 +; CHECK-NEXT: slli a1, a0, 49 +; CHECK-NEXT: srli a1, a1, 51 +; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 49 +; CHECK-NEXT: srli a1, a1, 53 +; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 49 +; CHECK-NEXT: srli a1, a1, 57 +; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: not a0, a0 -; CHECK-NEXT: srli a4, a0, 1 -; CHECK-NEXT: and a1, a4, a1 ; CHECK-NEXT: slli a0, a0, 49 ; CHECK-NEXT: srli a0, a0, 49 -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: and a1, a0, a2 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: srli a1, a0, 4 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: lui a1, 4112 -; CHECK-NEXT: addiw a1, a1, 257 -; CHECK-NEXT: and a0, a0, a3 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: srli a0, a0, 56 +; CHECK-NEXT: call __popcountdi2 +; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: li a0, 15 diff --git a/llvm/test/CodeGen/RISCV/pr95271.ll b/llvm/test/CodeGen/RISCV/pr95271.ll index aa941cb803627..46e9a196d6c59 100644 --- a/llvm/test/CodeGen/RISCV/pr95271.ll +++ b/llvm/test/CodeGen/RISCV/pr95271.ll @@ -6,29 +6,8 @@ define i32 @PR95271(ptr %p) { ; RV32I-LABEL: PR95271: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a0, 0(a0) -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: addi a0, a0, 1 -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: ret +; RV32I-NEXT: tail __popcountsi2 ; ; RV64I-LABEL: PR95271: ; RV64I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index 04a2f67c4942b..e783421e18769 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -11,38 +11,22 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: addi a1, 
a2, 1365 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 16 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: li a0, 32 @@ -61,14 +45,11 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: addi a2, a5, -241 -; RV32I-NEXT: bnez a1, .LBB1_2 -; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -80,27 +61,10 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB1_2: -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srli a0, s1, 1 +; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -110,22 +74,16 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: bnez s1, 
.LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: addi a0, s0, 32 +; RV32I-NEXT: .LBB1_2: ; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32XTHEADBB-LABEL: ctlz_i64: diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 98c86da41afa1..acfdff82d5a52 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -11,38 +11,22 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: addi a1, a2, 1365 -; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: srli a2, a0, 16 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: li a0, 32 @@ -61,14 +45,11 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: addi a2, a5, -241 -; RV32I-NEXT: bnez a1, .LBB1_2 -; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -80,27 +61,10 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB1_2: -; RV32I-NEXT: srli a0, a1, 
1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srli a0, s1, 1 +; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -110,22 +74,16 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: bnez s1, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: addi a0, s0, 32 +; RV32I-NEXT: .LBB1_2: ; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctlz_i64: @@ -253,28 +211,7 @@ declare i32 @llvm.ctpop.i32(i32) define i32 @ctpop_i32(i32 %a) nounwind { ; RV32I-LABEL: ctpop_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: addi a1, a2, -241 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 8 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: ret +; RV32I-NEXT: tail __popcountsi2 ; ; RV32ZBB-LABEL: ctpop_i32: ; RV32ZBB: # %bb.0: @@ -365,42 +302,21 @@ declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV32I-LABEL: ctpop_v2i32: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: and a2, a0, a4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a3, a1, a4 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: srli a3, a1, 4 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: slli a3, a1, 16 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a1, a1, 24 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded 
Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __popcountsi2 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i32: @@ -517,44 +433,12 @@ declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: srli a2, a1, 1 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: srli a5, a0, 1 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a3 -; RV32I-NEXT: and a2, a1, a4 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a3, a0, a4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: srli a2, a1, 4 -; RV32I-NEXT: srli a3, a0, 4 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: slli a3, a0, 16 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __popcountdi2 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i64: @@ -682,82 +566,38 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a2, 8(a1) -; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: lui a5, 349525 -; RV32I-NEXT: addi a5, a5, 1365 -; RV32I-NEXT: srli a6, a4, 1 -; RV32I-NEXT: srli a7, a3, 1 -; RV32I-NEXT: srli t0, a1, 1 -; RV32I-NEXT: srli t1, a2, 1 -; RV32I-NEXT: and a6, a6, a5 -; RV32I-NEXT: and a7, a7, a5 -; RV32I-NEXT: and t0, t0, a5 -; RV32I-NEXT: and a5, t1, a5 -; RV32I-NEXT: lui t1, 209715 -; RV32I-NEXT: addi t1, t1, 819 -; RV32I-NEXT: sub a4, a4, a6 -; RV32I-NEXT: sub a3, a3, a7 -; RV32I-NEXT: sub a1, a1, t0 -; RV32I-NEXT: sub a2, a2, a5 -; RV32I-NEXT: and a5, a4, t1 -; RV32I-NEXT: srli a4, a4, 2 -; RV32I-NEXT: and a6, a3, t1 -; RV32I-NEXT: srli a3, a3, 2 -; RV32I-NEXT: and a7, a1, t1 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and t0, a2, t1 -; RV32I-NEXT: srli a2, a2, 2 -; RV32I-NEXT: and a4, a4, t1 -; RV32I-NEXT: and a3, a3, t1 -; RV32I-NEXT: and a1, a1, t1 -; RV32I-NEXT: and a2, a2, t1 -; RV32I-NEXT: add a4, a5, a4 -; RV32I-NEXT: lui a5, 61681 -; RV32I-NEXT: addi a5, a5, -241 -; RV32I-NEXT: add a3, a6, a3 -; RV32I-NEXT: add a1, a7, a1 -; RV32I-NEXT: add a2, t0, a2 -; RV32I-NEXT: srli a6, a4, 4 -; RV32I-NEXT: srli a7, a3, 4 -; RV32I-NEXT: srli t0, a1, 4 -; RV32I-NEXT: add a4, a4, a6 -; RV32I-NEXT: srli a6, a2, 4 -; RV32I-NEXT: add a3, a3, a7 -; RV32I-NEXT: add a1, a1, t0 -; RV32I-NEXT: add a2, a2, a6 -; 
RV32I-NEXT: and a4, a4, a5 -; RV32I-NEXT: and a3, a3, a5 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: slli a5, a4, 8 -; RV32I-NEXT: slli a6, a3, 8 -; RV32I-NEXT: slli a7, a1, 8 -; RV32I-NEXT: slli t0, a2, 8 -; RV32I-NEXT: add a4, a4, a5 -; RV32I-NEXT: add a3, a3, a6 -; RV32I-NEXT: add a1, a1, a7 -; RV32I-NEXT: add a2, a2, t0 -; RV32I-NEXT: slli a5, a4, 16 -; RV32I-NEXT: slli a6, a3, 16 -; RV32I-NEXT: slli a7, a1, 16 -; RV32I-NEXT: slli t0, a2, 16 -; RV32I-NEXT: add a4, a4, a5 -; RV32I-NEXT: add a3, a3, a6 -; RV32I-NEXT: add a1, a1, a7 -; RV32I-NEXT: add a2, a2, t0 -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: srli a2, a2, 24 -; RV32I-NEXT: add a3, a3, a4 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw zero, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw zero, 12(a0) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw s0, 8(a1) +; RV32I-NEXT: lw s1, 12(a1) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: call __popcountdi2 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: srai s4, a0, 31 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __popcountdi2 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: sw s3, 0(s2) +; RV32I-NEXT: sw s4, 4(s2) +; RV32I-NEXT: sw a0, 8(s2) +; RV32I-NEXT: sw a1, 12(s2) +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i64: diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index d9f7d36127293..a11bc09954062 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -295,48 +295,24 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: lui a3, 209715 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: addiw a1, a2, 1365 -; RV64I-NEXT: addiw a2, a3, 819 -; RV64I-NEXT: srli a3, a0, 2 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 4 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a1, 
a0, 2 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: call __popcountdi2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 64 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 17eb0817d548a..bb7078461c244 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -285,48 +285,24 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: lui a3, 209715 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: addiw a1, a2, 1365 -; RV64I-NEXT: addiw a2, a3, 819 -; RV64I-NEXT: srli a3, a0, 2 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 4 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srli a3, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: call __popcountdi2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 64 @@ -828,35 +804,11 @@ declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-LABEL: ctpop_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: srli a3, a0, 1 -; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, a2 -; RV64I-NEXT: srli 
a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a2, a3, 32 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a2, a3, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a1, a0, 8 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __popcountdi2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_i64: @@ -948,52 +900,21 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV64I-LABEL: ctpop_v2i64: ; RV64I: # %bb.0: -; RV64I-NEXT: srli a2, a0, 1 -; RV64I-NEXT: lui a3, 349525 -; RV64I-NEXT: lui a4, 209715 -; RV64I-NEXT: lui a5, 61681 -; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: addiw a4, a4, 819 -; RV64I-NEXT: addiw a5, a5, -241 -; RV64I-NEXT: slli a6, a3, 32 -; RV64I-NEXT: add a3, a3, a6 -; RV64I-NEXT: slli a6, a4, 32 -; RV64I-NEXT: add a4, a4, a6 -; RV64I-NEXT: slli a6, a5, 32 -; RV64I-NEXT: add a5, a5, a6 -; RV64I-NEXT: srli a6, a1, 1 -; RV64I-NEXT: and a2, a2, a3 -; RV64I-NEXT: and a3, a6, a3 -; RV64I-NEXT: sub a0, a0, a2 -; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: and a2, a0, a4 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a3, a1, a4 -; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a0, a0, a4 -; RV64I-NEXT: and a1, a1, a4 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: add a1, a3, a1 -; RV64I-NEXT: srli a2, a0, 4 -; RV64I-NEXT: srli a3, a1, 4 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: and a0, a0, a5 -; RV64I-NEXT: and a1, a1, a5 -; RV64I-NEXT: slli a2, a0, 8 -; RV64I-NEXT: slli a3, a1, 8 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a2, a0, 16 -; RV64I-NEXT: slli a3, a1, 16 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: slli a2, a0, 32 -; RV64I-NEXT: slli a3, a1, 32 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: add a1, a1, a3 -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: srli a1, a1, 56 +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: call __popcountdi2 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __popcountdi2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_v2i64: diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index 49494608eee4d..4494d9b8b5691 100644 --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -316,52 +316,18 @@ declare float @baz(i32 signext %i3) define void @test7(i32 signext %arg, i32 signext %arg1) nounwind { ; RV64I-LABEL: test7: ; RV64I: # %bb.0: # %bb -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; 
RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sraw a0, a0, a1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: lui a3, 61681 -; RV64I-NEXT: lui a4, 4112 -; RV64I-NEXT: addiw s0, a1, 1365 -; RV64I-NEXT: addiw s1, a2, 819 -; RV64I-NEXT: addiw s2, a3, -241 -; RV64I-NEXT: addiw s3, a4, 257 -; RV64I-NEXT: slli a1, s0, 32 -; RV64I-NEXT: add s0, s0, a1 -; RV64I-NEXT: slli a1, s1, 32 -; RV64I-NEXT: add s1, s1, a1 -; RV64I-NEXT: slli a1, s2, 32 -; RV64I-NEXT: add s2, s2, a1 -; RV64I-NEXT: slli a1, s3, 32 -; RV64I-NEXT: add s3, s3, a1 ; RV64I-NEXT: .LBB6_1: # %bb2 ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: call foo -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: and a1, a1, s0 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: and a1, a0, s1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, s1 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: and a0, a0, s2 -; RV64I-NEXT: mul a0, a0, s3 -; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: call __popcountdi2 ; RV64I-NEXT: bnez a0, .LBB6_1 ; RV64I-NEXT: # %bb.2: # %bb7 -; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: test7: diff --git a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll index 724bd4f7963b8..9f89b2c495659 100644 --- a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autoenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){ @@ -7,56 +6,20 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: mov.w lr, #1431655765 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: mov.w r12, #858993459 -; CHECK-NEXT: vldr s1, .LCPI0_0 -; CHECK-NEXT: vmov.f32 s3, s1 -; CHECK-NEXT: and.w r0, lr, r2, lsr #1 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: and.w r2, r12, r0, lsr #2 -; CHECK-NEXT: bic r0, r0, #-858993460 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: and.w r2, lr, r1, lsr #1 -; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: add.w r0, r0, r0, lsr #4 -; CHECK-NEXT: and.w r2, r12, r1, lsr #2 -; CHECK-NEXT: bic r1, r1, #-858993460 -; CHECK-NEXT: add r1, r2 -; CHECK-NEXT: and.w r2, lr, r3, lsr #1 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: bic r5, r0, #-252645136 -; CHECK-NEXT: add.w r1, r1, r1, lsr #4 -; CHECK-NEXT: mov.w r0, #16843009 -; CHECK-NEXT: and.w r3, r12, r2, lsr #2 -; CHECK-NEXT: bic r2, r2, #-858993460 -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: and.w r3, lr, r4, lsr #1 -; CHECK-NEXT: subs r3, r4, r3 -; CHECK-NEXT: bic r1, r1, #-252645136 -; CHECK-NEXT: add.w r2, r2, r2, lsr #4 -; CHECK-NEXT: muls r5, r0, r5 -; CHECK-NEXT: and.w r4, r12, r3, lsr #2 -; CHECK-NEXT: bic r3, r3, #-858993460 -; CHECK-NEXT: bic r2, r2, 
#-252645136 -; CHECK-NEXT: add r3, r4 -; CHECK-NEXT: muls r1, r0, r1 -; CHECK-NEXT: add.w r3, r3, r3, lsr #4 -; CHECK-NEXT: muls r2, r0, r2 -; CHECK-NEXT: bic r3, r3, #-252645136 -; CHECK-NEXT: muls r0, r3, r0 -; CHECK-NEXT: lsrs r1, r1, #24 -; CHECK-NEXT: add.w r1, r1, r5, lsr #24 -; CHECK-NEXT: lsrs r2, r2, #24 -; CHECK-NEXT: vmov s2, r1 -; CHECK-NEXT: add.w r0, r2, r0, lsr #24 -; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: bl __popcountdi2 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: asrs r5, r4, #31 +; CHECK-NEXT: bl __popcountdi2 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %0 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %src) ret <2 x i64> %0 From 8e9ff8ea51b5a734df1314bd87ddb8dab31c2fbd Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Wed, 23 Apr 2025 02:47:24 -0700 Subject: [PATCH 038/245] [mlir][tosa] Align Variable ops to match with TOSA v1.0 spec (#130680) - updated AnyType:$value to Tosa_Tensor:$input1 and Tosa_Tensor:$output1 for VariableWrite and VriableRead Operators - updated description discrepancies - note: in the TOSA spec, we had var_shape attr, but it's already included in the TypeAttr:$type in MLIR Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td | 16 ++++++++-------- .../TosaToMLProgram/TosaToMLProgram.cpp | 2 +- .../Tosa/Transforms/TosaProfileCompliance.cpp | 8 +++++++- .../Dialect/Tosa/Transforms/TosaValidation.cpp | 5 ++--- .../TosaToLinalg/tosa-to-linalg-pipeline.mlir | 4 ++-- .../TosaToMLProgram/tosa-to-mlprogram.mlir | 4 ++-- mlir/test/Dialect/Tosa/invalid.mlir | 16 ++++++++-------- mlir/test/Dialect/Tosa/invalid_extension.mlir | 10 +++++----- mlir/test/Dialect/Tosa/level_check.mlir | 16 ++++++++-------- mlir/test/Dialect/Tosa/variables.mlir | 16 ++++++++-------- 10 files changed, 51 insertions(+), 46 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td index 3b2ede1b1a1a2..0ab0a62f1cf11 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td @@ -109,9 +109,9 @@ def Tosa_VariableOp : Tosa_Op<"variable", []> { } //===----------------------------------------------------------------------===// -// Operator: variable.write +// Operator: variable_write //===----------------------------------------------------------------------===// -def Tosa_VariableWriteOp : Tosa_Op<"variable.write", []> { +def Tosa_VariableWriteOp : Tosa_Op<"variable_write", []> { let summary = "write_buffer operator"; let description = [{ @@ -120,7 +120,7 @@ def Tosa_VariableWriteOp : Tosa_Op<"variable.write", []> { let arguments = (ins SymbolNameAttr:$name, - AnyType:$value + Tosa_Tensor:$input1 ); list availability = [ @@ -129,14 +129,14 @@ def Tosa_VariableWriteOp : Tosa_Op<"variable.write", []> { ]; let assemblyFormat = [{ - $name attr-dict `,` $value `:` type($value) + $name attr-dict `,` $input1 `:` type($input1) }]; } //===----------------------------------------------------------------------===// -// Operator: variable.read +// Operator: variable_read //===----------------------------------------------------------------------===// -def 
Tosa_VariableReadOp : Tosa_Op<"variable.read", []> { +def Tosa_VariableReadOp : Tosa_Op<"variable_read", []> { let summary = "read_buffer operator"; let description = [{ @@ -148,7 +148,7 @@ def Tosa_VariableReadOp : Tosa_Op<"variable.read", []> { ); let results = (outs - AnyType:$value + Tosa_Tensor:$output1 ); list availability = [ @@ -157,7 +157,7 @@ def Tosa_VariableReadOp : Tosa_Op<"variable.read", []> { ]; let assemblyFormat = [{ - $name attr-dict `:` type($value) + $name attr-dict `:` type($output1) }]; } diff --git a/mlir/lib/Conversion/TosaToMLProgram/TosaToMLProgram.cpp b/mlir/lib/Conversion/TosaToMLProgram/TosaToMLProgram.cpp index d134d8cdf485e..310566e692202 100644 --- a/mlir/lib/Conversion/TosaToMLProgram/TosaToMLProgram.cpp +++ b/mlir/lib/Conversion/TosaToMLProgram/TosaToMLProgram.cpp @@ -45,7 +45,7 @@ class VariableWriteOpConverter auto globalSymbolRef = SymbolRefAttr::get(rewriter.getContext(), op.getName()); auto newVariableWrite = rewriter.create( - op.getLoc(), globalSymbolRef, op.getValue()); + op.getLoc(), globalSymbolRef, op.getInput1()); rewriter.replaceOp(op, newVariableWrite); return success(); } diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp index 8f62253796175..3938c3731c47f 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp @@ -226,6 +226,12 @@ LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::VariableOp op) { return failure(); } +template <> +LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::VariableWriteOp op) { + addValue(op.getInput1()); + return success(); +} + template <> LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::IfOp op) { addValue(op.getCondition()); @@ -280,6 +286,7 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) { POPULATE_PROFILE_INFO_CUSTOM(Rescale) POPULATE_PROFILE_INFO_CUSTOM(MatMul) POPULATE_PROFILE_INFO_CUSTOM(Variable) + POPULATE_PROFILE_INFO_CUSTOM(VariableWrite) POPULATE_PROFILE_INFO_CUSTOM(If) POPULATE_PROFILE_INFO_CUSTOM(While) @@ -334,7 +341,6 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) { POPULATE_PROFILE_INFO_COMMON(Reverse) POPULATE_PROFILE_INFO_COMMON(Identity) POPULATE_PROFILE_INFO_COMMON(VariableRead) - POPULATE_PROFILE_INFO_COMMON(VariableWrite) // Type Invariant Extension, a capability extension that is independent // of the data type, meaning any compatible type can be used. 
No type diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index ef9d27f8df0ad..baa202833e285 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -767,7 +767,7 @@ inline bool CompatibleTypes(const mlir::Type &type, bool TosaValidation::CheckVariable(Operation *op) { if (isa(op)) { - auto nameAttr = cast(op->getAttr("name")); + mlir::StringAttr nameAttr = cast(op->getAttr("name")); if (variablesMap.count(nameAttr)) { op->emitOpError() << "name has already been declared"; @@ -786,8 +786,7 @@ bool TosaValidation::CheckVariable(Operation *op) { bool TosaValidation::CheckVariableReadOrWrite(Operation *op) { if (isa(op) || isa(op)) { - auto nameAttr = cast(op->getAttr("name")); - + mlir::StringAttr nameAttr = cast(op->getAttr("name")); if (!variablesMap.count(nameAttr)) { op->emitOpError() << "name has not been declared"; return false; diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir index 731e134ed1a07..37ed5cec00a0d 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir @@ -6,8 +6,8 @@ // check that -tosa-validate of stateful ops kick in func.func @test_variable_write_shape(%arg0: tensor<1x4x8xi8>) -> () { tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.write' op operand type does not equal variable type}} - tosa.variable.write @stored_var, %arg0 : tensor<1x4x8xi8> + // expected-error@+1 {{'tosa.variable_write' op operand type does not equal variable type}} + tosa.variable_write @stored_var, %arg0 : tensor<1x4x8xi8> return } diff --git a/mlir/test/Conversion/TosaToMLProgram/tosa-to-mlprogram.mlir b/mlir/test/Conversion/TosaToMLProgram/tosa-to-mlprogram.mlir index 69b6875987daf..365b05ff084da 100644 --- a/mlir/test/Conversion/TosaToMLProgram/tosa-to-mlprogram.mlir +++ b/mlir/test/Conversion/TosaToMLProgram/tosa-to-mlprogram.mlir @@ -5,9 +5,9 @@ module { tosa.variable @var_x = dense<7.000000e+00> : tensor<1xf32> func.func @test_stateful_ops(%arg0: tensor<1xf32>) -> (tensor<1xf32>) { // CHECK: ml_program.global_store @var_x = %arg0 : tensor<1xf32> - tosa.variable.write @var_x, %arg0 : tensor<1xf32> + tosa.variable_write @var_x, %arg0 : tensor<1xf32> // CHECK: %[[LOAD:.+]] = ml_program.global_load @var_x : tensor<1xf32> - %0 = tosa.variable.read @var_x : tensor<1xf32> + %0 = tosa.variable_read @var_x : tensor<1xf32> return %0 : tensor<1xf32> } } \ No newline at end of file diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index fc98aa95ed5b3..269ed58fdc81c 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -626,8 +626,8 @@ func.func @test_variable_duplicates(%arg0: tensor<2x4x8xi8>) -> () { func.func @test_variable_read_type(%arg0: tensor<2x4x8xi8>) -> () { tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.read' op result type does not equal variable type}} - %0 = tosa.variable.read @stored_var : tensor<2x4x8xi16> + // expected-error@+1 {{'tosa.variable_read' op illegal: operand/result data types not supported}} + %0 = tosa.variable_read @stored_var : tensor<2x4x8xi16> return } @@ -635,8 +635,8 @@ func.func @test_variable_read_type(%arg0: tensor<2x4x8xi8>) -> () { func.func @test_variable_read_shape(%arg0: 
tensor<2x4x8xi8>) -> () { tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.read' op result type does not equal variable type}} - %0 = tosa.variable.read @stored_var : tensor<1x4x8xi32> + // expected-error@+1 {{'tosa.variable_read' op illegal: operand/result data types not supported}} + %0 = tosa.variable_read @stored_var : tensor<1x4x8xi32> return } @@ -644,8 +644,8 @@ func.func @test_variable_read_shape(%arg0: tensor<2x4x8xi8>) -> () { func.func @test_variable_write_type(%arg0: tensor<2x4x8xi16>) -> () { tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.write' op operand type does not equal variable type}} - tosa.variable.write @stored_var, %arg0 : tensor<2x4x8xi16> + // expected-error@+1 {{'tosa.variable_write' op illegal: operand/result data types not supported}} + tosa.variable_write @stored_var, %arg0 : tensor<2x4x8xi16> return } @@ -653,8 +653,8 @@ func.func @test_variable_write_type(%arg0: tensor<2x4x8xi16>) -> () { func.func @test_variable_write_shape(%arg0: tensor<1x4x8xi8>) -> () { tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.write' op operand type does not equal variable type}} - tosa.variable.write @stored_var, %arg0 : tensor<1x4x8xi8> + // expected-error@+1 {{'tosa.variable_write' op operand type does not equal variable type}} + tosa.variable_write @stored_var, %arg0 : tensor<1x4x8xi8> return } diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir index 7386b1ba9df99..bb0d3b46955a1 100644 --- a/mlir/test/Dialect/Tosa/invalid_extension.mlir +++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir @@ -313,17 +313,17 @@ func.func @test_identity(%arg0: tensor<13x21x3xi4>) -> tensor<13x21x3xi4> { func.func @test_variable_read_type(%arg0: tensor<2x4x8xi8>) -> () { // expected-error@+1 {{'tosa.variable' op illegal: requires [variable] but not enabled in target}} tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.read' op illegal: requires [variable]}} - %0 = tosa.variable.read @stored_var : tensor<2x4x8xi16> + // expected-error@+1 {{'tosa.variable_read' op illegal: requires [variable]}} + %0 = tosa.variable_read @stored_var : tensor<2x4x8xi8> return } // ----- -func.func @test_variable_write_type(%arg0: tensor<2x4x8xi16>) -> () { +func.func @test_variable_write_type(%arg0: tensor<2x4x8xi8>) -> () { // expected-error@+1 {{'tosa.variable' op illegal: requires [variable] but not enabled in target}} tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi8> - // expected-error@+1 {{'tosa.variable.write' op illegal: requires [variable]}} - tosa.variable.write @stored_var, %arg0 : tensor<2x4x8xi16> + // expected-error@+1 {{'tosa.variable_write' op illegal: requires [variable]}} + tosa.variable_write @stored_var, %arg0 : tensor<2x4x8xi8> return } diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index b48f614770fcb..8d91142c678fe 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -1089,10 +1089,10 @@ func.func @test_scatter_tensor_size_invalid(%arg0: tensor<13x210000000x3xf32>, % func.func @test_variable_read_write_tensor_size_invalid() -> () { tosa.variable @stored_var = dense<3.14> : tensor<536870912xf32> - // expected-error@+1 {{'tosa.variable.read' op failed level check: result tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}} - %0 = tosa.variable.read 
@stored_var : tensor<536870912xf32> - // expected-error@+1 {{'tosa.variable.write' op failed level check: operand tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}} - tosa.variable.write @stored_var, %0 : tensor<536870912xf32> + // expected-error@+1 {{'tosa.variable_read' op failed level check: result tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}} + %0 = tosa.variable_read @stored_var : tensor<536870912xf32> + // expected-error@+1 {{'tosa.variable_write' op failed level check: operand tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}} + tosa.variable_write @stored_var, %0 : tensor<536870912xf32> return } @@ -1157,10 +1157,10 @@ func.func @test_cond_if_rank_invalid(%arg0: tensor<1x1x1x1x1x1x1x1xf32>, %arg1: func.func @test_variable_read_write_rank_invalid() -> () { // expected-error@+1 {{'tosa.variable' op failed level check: attribute rank(shape) <= MAX_RANK}} tosa.variable @stored_var = dense<3.14> : tensor<1x1x1x1x1x1x1x1xf32> - // expected-error@+1 {{'tosa.variable.read' op failed level check: result rank(shape) <= MAX_RANK}} - %0 = tosa.variable.read @stored_var : tensor<1x1x1x1x1x1x1x1xf32> - // expected-error@+1 {{'tosa.variable.write' op failed level check: operand rank(shape) <= MAX_RANK}} - tosa.variable.write @stored_var, %0 : tensor<1x1x1x1x1x1x1x1xf32> + // expected-error@+1 {{'tosa.variable_read' op failed level check: result rank(shape) <= MAX_RANK}} + %0 = tosa.variable_read @stored_var : tensor<1x1x1x1x1x1x1x1xf32> + // expected-error@+1 {{'tosa.variable_write' op failed level check: operand rank(shape) <= MAX_RANK}} + tosa.variable_write @stored_var, %0 : tensor<1x1x1x1x1x1x1x1xf32> return } diff --git a/mlir/test/Dialect/Tosa/variables.mlir b/mlir/test/Dialect/Tosa/variables.mlir index 9a26aa0bc8bf4..6fa6b26155461 100644 --- a/mlir/test/Dialect/Tosa/variables.mlir +++ b/mlir/test/Dialect/Tosa/variables.mlir @@ -8,12 +8,12 @@ func.func @test_variable_scalar(%arg0: tensor) -> () { // CHECK: tosa.variable @stored_var = dense<3.140000e+00> : tensor tosa.variable @stored_var = dense<3.14> : tensor - // CHECK: %[[STORED_VAL:.*]] = tosa.variable.read @stored_var : tensor - %0 = tosa.variable.read @stored_var : tensor + // CHECK: %[[STORED_VAL:.*]] = tosa.variable_read @stored_var : tensor + %0 = tosa.variable_read @stored_var : tensor // CHECK: %[[RESULT_ADD:.*]] = tosa.add %[[ADD_VAL]], %[[STORED_VAL]] : (tensor, tensor) -> tensor %1 = "tosa.add"(%arg0, %0) : (tensor, tensor) -> tensor - // CHECK: tosa.variable.write @stored_var, %[[RESULT_ADD]] : tensor - tosa.variable.write @stored_var, %1 : tensor + // CHECK: tosa.variable_write @stored_var, %[[RESULT_ADD]] : tensor + tosa.variable_write @stored_var, %1 : tensor return } @@ -23,11 +23,11 @@ func.func @test_variable_scalar(%arg0: tensor) -> () { func.func @test_variable_tensor(%arg0: tensor<2x4x8xi32>) -> () { // CHECK: tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32> tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32> - // CHECK: %[[STORED_VAL:.*]] = tosa.variable.read @stored_var : tensor<2x4x8xi32> - %0 = tosa.variable.read @stored_var : tensor<2x4x8xi32> + // CHECK: %[[STORED_VAL:.*]] = tosa.variable_read @stored_var : tensor<2x4x8xi32> + %0 = tosa.variable_read @stored_var : tensor<2x4x8xi32> // CHECK: %[[RESULT_ADD:.*]] = tosa.add %[[ADD_VAL]], %[[STORED_VAL]] : (tensor<2x4x8xi32>, tensor<2x4x8xi32>) -> tensor<2x4x8xi32> %1 = "tosa.add"(%arg0, %0) : (tensor<2x4x8xi32>, tensor<2x4x8xi32>) -> tensor<2x4x8xi32> - // CHECK: tosa.variable.write @stored_var, %[[RESULT_ADD]] : tensor<2x4x8xi32> - 
tosa.variable.write @stored_var, %1 : tensor<2x4x8xi32> + // CHECK: tosa.variable_write @stored_var, %[[RESULT_ADD]] : tensor<2x4x8xi32> + tosa.variable_write @stored_var, %1 : tensor<2x4x8xi32> return } From a7999f3fba49b7b5da08afb070841f792ea1c796 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 17 Apr 2025 15:58:39 +0100 Subject: [PATCH 039/245] [NFC][AArch64TTI] Refactor instCombineSVEVectorMul into simplifySVEIntrinsicBinOp. --- .../AArch64/AArch64TargetTransformInfo.cpp | 134 +++++++++--------- 1 file changed, 65 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 324e234db6120..c670b2ae71bf3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1497,6 +1497,65 @@ static bool isAllActivePredicate(Value *Pred) { return (C && C->isAllOnesValue()); } +// Simplify `V` by only considering the operations that affect active lanes. +// This function should only return existing Values or newly created Constants. +static Value *stripInactiveLanes(Value *V, const Value *Pg) { + auto *Dup = dyn_cast(V); + if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup && + Dup->getOperand(1) == Pg && isa(Dup->getOperand(2))) + return ConstantVector::getSplat( + cast(V->getType())->getElementCount(), + cast(Dup->getOperand(2))); + + return V; +} + +static std::optional +simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, + const SVEIntrinsicInfo &IInfo) { + const unsigned Opc = IInfo.getMatchingIROpode(); + assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!"); + + Value *Pg = II.getOperand(0); + Value *Op1 = II.getOperand(1); + Value *Op2 = II.getOperand(2); + const DataLayout &DL = II.getDataLayout(); + + // Canonicalise constants to the RHS. + if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() && + isa(Op1) && !isa(Op2)) { + IC.replaceOperand(II, 1, Op2); + IC.replaceOperand(II, 2, Op1); + return &II; + } + + // Only active lanes matter when simplifying the operation. + Op1 = stripInactiveLanes(Op1, Pg); + Op2 = stripInactiveLanes(Op2, Pg); + + Value *SimpleII; + if (auto FII = dyn_cast(&II)) + SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL); + else + SimpleII = simplifyBinOp(Opc, Op1, Op2, DL); + + if (!SimpleII) + return std::nullopt; + + if (IInfo.inactiveLanesAreNotDefined()) + return IC.replaceInstUsesWith(II, SimpleII); + + Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()); + + // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)). + if (SimpleII == Inactive) + return IC.replaceInstUsesWith(II, SimpleII); + + // Inactive lanes must be preserved. + SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive); + return IC.replaceInstUsesWith(II, SimpleII); +} + // Use SVE intrinsic info to eliminate redundant operands and/or canonicalise // to operations with less strict inactive lane requirements. static std::optional @@ -1537,6 +1596,11 @@ simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, } } + // Operation specific simplifications. + if (IInfo.hasMatchingIROpode() && + Instruction::isBinaryOp(IInfo.getMatchingIROpode())) + return simplifySVEIntrinsicBinOp(IC, II, IInfo); + return std::nullopt; } @@ -2220,68 +2284,6 @@ static std::optional instCombineSVEVectorSub(InstCombiner &IC, return std::nullopt; } -// Simplify `V` by only considering the operations that affect active lanes. 
-// This function should only return existing Values or newly created Constants. -static Value *stripInactiveLanes(Value *V, const Value *Pg) { - auto *Dup = dyn_cast(V); - if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup && - Dup->getOperand(1) == Pg && isa(Dup->getOperand(2))) - return ConstantVector::getSplat( - cast(V->getType())->getElementCount(), - cast(Dup->getOperand(2))); - - return V; -} - -static std::optional -instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, - const SVEIntrinsicInfo &IInfo) { - const unsigned Opc = IInfo.getMatchingIROpode(); - if (!Instruction::isBinaryOp(Opc)) - return std::nullopt; - - Value *Pg = II.getOperand(0); - Value *Op1 = II.getOperand(1); - Value *Op2 = II.getOperand(2); - const DataLayout &DL = II.getDataLayout(); - - // Canonicalise constants to the RHS. - if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() && - isa(Op1) && !isa(Op2)) { - IC.replaceOperand(II, 1, Op2); - IC.replaceOperand(II, 2, Op1); - return &II; - } - - // Only active lanes matter when simplifying the operation. - Op1 = stripInactiveLanes(Op1, Pg); - Op2 = stripInactiveLanes(Op2, Pg); - - Value *SimpleII; - if (auto FII = dyn_cast(&II)) - SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL); - else - SimpleII = simplifyBinOp(Opc, Op1, Op2, DL); - - if (SimpleII) { - if (IInfo.inactiveLanesAreNotDefined()) - return IC.replaceInstUsesWith(II, SimpleII); - - Value *Inactive = - II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()); - - // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)). - if (SimpleII == Inactive) - return IC.replaceInstUsesWith(II, SimpleII); - - // Inactive lanes must be preserved. - SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive); - return IC.replaceInstUsesWith(II, SimpleII); - } - - return instCombineSVEVectorBinOp(IC, II); -} - static std::optional instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II) { Value *UnpackArg = II.getArgOperand(0); @@ -2689,10 +2691,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEVectorFAdd(IC, II); case Intrinsic::aarch64_sve_fadd_u: return instCombineSVEVectorFAddU(IC, II); - case Intrinsic::aarch64_sve_fmul: - return instCombineSVEVectorMul(IC, II, IInfo); case Intrinsic::aarch64_sve_fmul_u: - return instCombineSVEVectorMul(IC, II, IInfo); + return instCombineSVEVectorBinOp(IC, II); case Intrinsic::aarch64_sve_fsub: return instCombineSVEVectorFSub(IC, II); case Intrinsic::aarch64_sve_fsub_u: @@ -2703,10 +2703,6 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEVectorFuseMulAddSub( IC, II, true); - case Intrinsic::aarch64_sve_mul: - return instCombineSVEVectorMul(IC, II, IInfo); - case Intrinsic::aarch64_sve_mul_u: - return instCombineSVEVectorMul(IC, II, IInfo); case Intrinsic::aarch64_sve_sub: return instCombineSVEVectorSub(IC, II); case Intrinsic::aarch64_sve_sub_u: From 3c3fb357a0ed4dbf640bdb6c61db2a430f7eb298 Mon Sep 17 00:00:00 2001 From: TatWai Chong Date: Wed, 23 Apr 2025 03:03:28 -0700 Subject: [PATCH 040/245] [mlir][tosa] Enhance CONV3D & DEPTHWISE_CONV2D verifier (#135738) Verify the correctness of pad, stride, dilation, and dimension of input/weight/bias/output. Adapt and extend the existing conv2d error_if function to support additional convolution variants. 
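For illustration only (not part of this patch): the new verifier applies the spec's ERROR_IF
output-size rule, O = idiv_check(I - 1 + pad_before + pad_after - (K - 1) * dilation, stride) + 1,
to each spatial dimension of CONV3D and DEPTHWISE_CONV2D. A hypothetical snippet of the kind of IR
that is now rejected is sketched below; the function name, shapes, and exact diagnostic wording are
illustrative assumptions, not taken from the patch.

  // With input depth 13, kernel depth 1, no depth padding, dilation 1 and stride_d = 3,
  // the rule gives OD = (13 - 1 + 0 + 0 - (1 - 1) * 1) / 3 + 1 = 5, so declaring an
  // output depth of 6 should now fail verification.
  func.func @conv3d_bad_output_depth(%arg0: tensor<1x13x8x8x1xf32>, %arg1: tensor<1x1x1x1x1xf32>, %arg2: tensor<1xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x6x8x8x1xf32> {
    // expected-error@+1 {{'tosa.conv3d' op calculated output depth did not match expected: calculated=5, expected=6}}
    %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 3, 1, 1>} : (tensor<1x13x8x8x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x6x8x8x1xf32>
    return %0 : tensor<1x6x8x8x1xf32>
  }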
--- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 244 +++++++++++------- .../TosaToLinalg/tosa-to-linalg-named.mlir | 44 ++-- mlir/test/Dialect/Tosa/availability.mlir | 4 +- mlir/test/Dialect/Tosa/canonicalize.mlir | 12 +- mlir/test/Dialect/Tosa/invalid_extension.mlir | 8 +- mlir/test/Dialect/Tosa/level_check.mlir | 120 ++++----- mlir/test/Dialect/Tosa/ops.mlir | 16 +- .../Tosa/profile_pro_fp_unsupported.mlir | 4 +- .../Tosa/profile_pro_int_unsupported.mlir | 4 +- mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 35 ++- mlir/test/Dialect/Tosa/verifier.mlir | 152 +++++++++++ 11 files changed, 425 insertions(+), 218 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 8b4f6ef0d0980..1ab4ce7d4558b 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -428,6 +428,150 @@ static LogicalResult verifyConvOpModes(T op) { return success(); } +//===----------------------------------------------------------------------===// +// ERROR_IF functions. +// ERROR_IF is a predicate that must set an error if the condition holds. +//===----------------------------------------------------------------------===// + +template +static LogicalResult verifyConvOpErrorIf(T op) { + llvm::ArrayRef padding = op.getPad(); + if (llvm::any_of(padding, [](int64_t p) { return p < 0; })) + return op.emitOpError("expect all padding values to be >= 0, got ") + << padding; + + llvm::ArrayRef strides = op.getStride(); + if (llvm::any_of(strides, [](int64_t s) { return s < 1; })) + return op.emitOpError("expect all stride values to be >= 1, got ") + << strides; + + llvm::ArrayRef dilations = op.getDilation(); + if (llvm::any_of(dilations, [](int64_t d) { return d < 1; })) + return op.emitOpError("expect all dilation values to be >= 1, got ") + << dilations; + + const RankedTensorType outputType = + llvm::dyn_cast(op.getOutput().getType()); + if (!outputType) + // Skip following checks if output is not ranked + return success(); + + const RankedTensorType inputType = + llvm::dyn_cast(op.getInput().getType()); + const RankedTensorType weightType = + llvm::dyn_cast(op.getWeight().getType()); + + if (inputType && weightType) { + const auto verifyOutputSize = + [&op](const int64_t inputSize, const int64_t kernelSize, + const int64_t outputSize, const int64_t padBefore, + const int64_t padAfter, const int64_t stride, + const int64_t dilation, const llvm::StringRef dimName, + const llvm::StringRef dimAxis, + const llvm::StringRef padBeforeName, + const llvm::StringRef padAfterName) -> LogicalResult { + if (inputSize == ShapedType::kDynamic || + kernelSize == ShapedType::kDynamic) + return success(); + + // ERROR_IF: O != idiv_check(I - 1 + pa + pb - (K - 1) * d, s) + 1 + + const std::optional calculatedOutSizeMinusOne = idivCheck( + inputSize - 1 + padBefore + padAfter - (kernelSize - 1) * dilation, + stride); + if (!calculatedOutSizeMinusOne.has_value()) + return op.emitOpError("expected input_") + << dimName << " - 1 + pad_" << padBeforeName << " + pad_" + << padAfterName << " - (kernel_" << dimName + << " - 1) * dilation_" << dimAxis + << " to be wholly divisible by stride_" << dimAxis << ", got (" + << inputSize << " - 1 + " << padBefore << " + " << padAfter + << " - (" << kernelSize << " - 1) * " << dilation << ") / " + << stride; + + const int64_t calculatedOutSize = calculatedOutSizeMinusOne.value() + 1; + if (outputSize != ShapedType::kDynamic && calculatedOutSize != outputSize) + return op.emitOpError("calculated output ") + << dimName << " 
did not match expected: " + << "calculated=" << calculatedOutSize + << ", expected=" << outputSize; + + return success(); + }; + + // input = [_,IH,IW,_], weight = [_,KH,KW,_], output = [_,OH,OW,_] + if constexpr (std::is_same::value) { + if (failed(verifyOutputSize( + inputType.getDimSize(1), weightType.getDimSize(1), + outputType.getDimSize(1), padding[0], padding[1], strides[0], + dilations[0], "height", "y", "top", "bottom"))) + return failure(); + + if (failed(verifyOutputSize( + inputType.getDimSize(2), weightType.getDimSize(2), + outputType.getDimSize(2), padding[2], padding[3], strides[1], + dilations[1], "width", "x", "left", "right"))) + return failure(); + } + + // input = [_,IH,IW,_], weight = [KH,KW,_,_], output = [_,OH,OW,_] + if constexpr (std::is_same::value) { + if (failed(verifyOutputSize( + inputType.getDimSize(1), weightType.getDimSize(0), + outputType.getDimSize(1), padding[0], padding[1], strides[0], + dilations[0], "height", "y", "top", "bottom"))) + return failure(); + + if (failed(verifyOutputSize( + inputType.getDimSize(2), weightType.getDimSize(1), + outputType.getDimSize(2), padding[2], padding[3], strides[1], + dilations[1], "width", "x", "left", "right"))) + return failure(); + } + + // input = [_,ID,IH,IW,_], weight = [_,KD,KH,KW,_], output = [_,OD,OH,OW,_] + if constexpr (std::is_same::value) { + if (failed(verifyOutputSize( + inputType.getDimSize(1), weightType.getDimSize(1), + outputType.getDimSize(1), padding[0], padding[1], strides[0], + dilations[0], "depth", "d", "front", "back"))) + return failure(); + + if (failed(verifyOutputSize( + inputType.getDimSize(2), weightType.getDimSize(2), + outputType.getDimSize(2), padding[2], padding[3], strides[1], + dilations[1], "height", "y", "top", "bottom"))) + return failure(); + + if (failed(verifyOutputSize( + inputType.getDimSize(3), weightType.getDimSize(3), + outputType.getDimSize(3), padding[4], padding[5], strides[2], + dilations[2], "width", "x", "left", "right"))) + return failure(); + } + } + + const RankedTensorType biasType = + llvm::dyn_cast(op.getBias().getType()); + if (!biasType) + // Skip following checks if bias is not ranked + return success(); + + const int64_t biasChannels = biasType.getDimSize(0); + const int64_t outputChannels = outputType.getDimSize(3); + if (biasChannels == ShapedType::kDynamic || + outputChannels == ShapedType::kDynamic) + // Skip following checks if biasChannels or outputChannels is dynamic dim + return success(); + + if (biasChannels != outputChannels && biasChannels != 1) + return op.emitOpError( + "bias channels expected to be equal to output channels (") + << outputChannels << ") or 1, got " << biasChannels; + + return success(); +} + // verify that inType and outType have same element types template static LogicalResult verifySameElementTypes(T op, Type inType, Type outType) { @@ -2586,99 +2730,9 @@ LogicalResult Conv2DOp::inferReturnTypeComponents( } LogicalResult Conv2DOp::verify() { - if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed()) + if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed() || + verifyConvOpErrorIf(*this).failed()) return failure(); - - llvm::ArrayRef padding = getPad(); - if (llvm::any_of(padding, [](int64_t p) { return p < 0; })) - return emitOpError("expect all padding values to be >= 0, got ") << padding; - - llvm::ArrayRef strides = getStride(); - if (llvm::any_of(strides, [](int64_t s) { return s < 1; })) - return emitOpError("expect all stride values to be >= 1, got ") << strides; - - llvm::ArrayRef 
dilations = getDilation(); - if (llvm::any_of(dilations, [](int64_t d) { return d < 1; })) - return emitOpError("expect all dilation values to be >= 1, got ") - << dilations; - - const RankedTensorType outputType = - llvm::dyn_cast(getOutput().getType()); - if (!outputType) - // Skip following checks if output is not ranked - return success(); - - const RankedTensorType inputType = - llvm::dyn_cast(getInput().getType()); - const RankedTensorType weightType = - llvm::dyn_cast(getWeight().getType()); - - if (inputType && weightType) { - const auto verifyOutputSize = - [this](const int64_t inputSize, const int64_t kernelSize, - const int64_t outputSize, const int64_t padBefore, - const int64_t padAfter, const int64_t stride, - const int64_t dilation, const llvm::StringRef dimName, - const llvm::StringRef dimAxis, - const llvm::StringRef padBeforeName, - const llvm::StringRef padAfterName) -> LogicalResult { - if (inputSize == ShapedType::kDynamic || - kernelSize == ShapedType::kDynamic) - return success(); - - const std::optional calculatedOutSizeMinusOne = idivCheck( - inputSize - 1 + padBefore + padAfter - (kernelSize - 1) * dilation, - stride); - if (!calculatedOutSizeMinusOne.has_value()) - return emitOpError("expected input_") - << dimName << " - 1 + pad_" << padBeforeName << " + pad_" - << padAfterName << " - (kernel_" << dimName - << " - 1) * dilation_" << dimAxis - << " to be wholly divisible by stride_" << dimAxis << ", got (" - << inputSize << " - 1 + " << padBefore << " + " << padAfter - << " - (" << kernelSize << " - 1) * " << dilation << ") / " - << stride; - - const int64_t calculatedOutSize = calculatedOutSizeMinusOne.value() + 1; - if (outputSize != ShapedType::kDynamic && calculatedOutSize != outputSize) - return emitOpError("calculated output ") - << dimName << " did not match expected: " - << "calculated=" << calculatedOutSize - << ", expected=" << outputSize; - - return success(); - }; - - if (failed(verifyOutputSize( - inputType.getDimSize(1), weightType.getDimSize(1), - outputType.getDimSize(1), padding[0], padding[1], strides[0], - dilations[0], "height", "y", "top", "bottom"))) - return failure(); - - if (failed(verifyOutputSize( - inputType.getDimSize(2), weightType.getDimSize(2), - outputType.getDimSize(2), padding[2], padding[3], strides[1], - dilations[1], "width", "x", "left", "right"))) - return failure(); - } - - const RankedTensorType biasType = - llvm::dyn_cast(getBias().getType()); - if (!biasType) - // Skip following checks if bias is not ranked - return success(); - - const int64_t biasChannels = biasType.getDimSize(0); - const int64_t outputChannels = outputType.getDimSize(3); - if (biasChannels == ShapedType::kDynamic || - outputChannels == ShapedType::kDynamic) - // Skip following checks if biasChannels or outputChannels is dynamic dim - return success(); - - if (biasChannels != outputChannels && biasChannels != 1) - return emitOpError( - "bias channels expected to be equal to output channels (") - << outputChannels << ") or 1, got " << biasChannels; return success(); } @@ -2753,7 +2807,8 @@ LogicalResult Conv3DOp::inferReturnTypeComponents( } LogicalResult Conv3DOp::verify() { - if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed()) + if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed() || + verifyConvOpErrorIf(*this).failed()) return failure(); return success(); } @@ -2863,7 +2918,8 @@ LogicalResult DepthwiseConv2DOp::inferReturnTypeComponents( } LogicalResult DepthwiseConv2DOp::verify() { - if 
(verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed()) + if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed() || + verifyConvOpErrorIf(*this).failed()) return failure(); return success(); } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index 242772fe5cdcf..a737a8a05bae6 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -878,22 +878,22 @@ func.func @depthwise_conv2d_f16_f32_acc(%arg0 : tensor<1x7x5x3xf16>, %arg1 : ten // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> // CHECK-LABEL: @conv3d_f32 -func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4x5x27xf32>, %bias: tensor<28xf32>) -> () { - // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xf32>) permutation = [1, 2, 3, 4, 0] - // CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32> +func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<43x3x4x5x27xf32>, %bias: tensor<43xf32>) -> () { + // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<43x3x4x5x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x43xf32>) permutation = [1, 2, 3, 4, 0] + // CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x43xf32> // CHECK: %[[BROADCAST:.+]] = linalg.generic // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} - // CHECK-SAME: ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x47x45x43x28xf32>) { + // CHECK-SAME: ins(%arg2 : tensor<43xf32>) outs(%[[INIT]] : tensor<1x47x45x43x43xf32>) { // CHECK: ^bb0(%[[IN:.+]]: f32, %[[OUT:.+]]: f32): // CHECK: linalg.yield %[[IN]] : f32 - // CHECK: } -> tensor<1x47x45x43x28xf32> + // CHECK: } -> tensor<1x47x45x43x43xf32> // CHECK: linalg.conv_3d_ndhwc_dhwcf // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} - // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x48x47x27xf32>, tensor<3x4x5x27x28xf32>) - // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32> + // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x48x47x27xf32>, tensor<3x4x5x27x43xf32>) + // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x43xf32>) -> tensor<1x47x45x43x43xf32> %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> %weight_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> - %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<28xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x47x45x43x28xf32> + %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xf32>, tensor<43x3x4x5x27xf32>, tensor<43xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x47x45x43x43xf32> return } @@ -919,40 +919,40 @@ func.func @conv3d_scalar_bias_f32(%input: tensor<1x49x48x47x27xf32>, %weights: t // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> // CHECK-LABEL: @conv3d_i8 -func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5x27xi8>, 
%bias: tensor<28xi32>) -> () { - // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xi8>) permutation = [1, 2, 3, 4, 0] - // CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xi32> +func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<43x3x4x5x27xi8>, %bias: tensor<43xi32>) -> () { + // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<43x3x4x5x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x43xi8>) permutation = [1, 2, 3, 4, 0] + // CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x43xi32> // CHECK: %[[BROADCAST:.+]] = linalg.generic // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} - // CHECK-SAME: ins(%arg2 : tensor<28xi32>) - // CHECK-SAME: outs(%[[INIT]] : tensor<1x47x45x43x28xi32>) { + // CHECK-SAME: ins(%arg2 : tensor<43xi32>) + // CHECK-SAME: outs(%[[INIT]] : tensor<1x47x45x43x43xi32>) { // CHECK: ^bb0(%[[IN:.+]]: i32, %[[OUT:.+]]: i32): // CHECK: linalg.yield %[[IN]] : i32 - // CHECK: } -> tensor<1x47x45x43x28xi32> + // CHECK: } -> tensor<1x47x45x43x43xi32> // CHECK: %[[IZP:.+]] = arith.constant -128 : i32 // CHECK: %[[FZP:.+]] = arith.constant 42 : i32 // CHECK: linalg.conv_3d_ndhwc_dhwcf_q // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} - // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]], %[[IZP]], %[[FZP]] : tensor<1x49x48x47x27xi8>, tensor<3x4x5x27x28xi8>, i32, i32) - // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xi32>) -> tensor<1x47x45x43x28xi32> + // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]], %[[IZP]], %[[FZP]] : tensor<1x49x48x47x27xi8>, tensor<3x4x5x27x43xi8>, i32, i32) + // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x43xi32>) -> tensor<1x47x45x43x43xi32> %input_zp = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> %weight_zp = "tosa.const"() <{values = dense<42> : tensor<1xi8>}> : () -> tensor<1xi8> - %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = i32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x47x45x43x28xi32> + %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = i32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xi8>, tensor<43x3x4x5x27xi8>, tensor<43xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x47x45x43x43xi32> return } // ----- // CHECK-LABEL: @conv3d_f16_f32_acc -func.func @conv3d_f16_f32_acc(%input: tensor<1x49x48x47x27xf16>, %weights: tensor<28x3x4x5x27xf16>, %bias: tensor<28xf16>) -> () { +func.func @conv3d_f16_f32_acc(%input: tensor<1x49x48x47x27xf16>, %weights: tensor<43x3x4x5x27xf16>, %bias: tensor<43xf16>) -> () { %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> %weight_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> - // CHECK: linalg.generic {{{.*}}} ins(%{{.*}} : tensor<28xf16>) outs(%{{.*}} : tensor<1x47x45x43x28xf32>) + // CHECK: linalg.generic {{{.*}}} ins(%{{.*}} : tensor<43xf16>) outs(%{{.*}} : tensor<1x47x45x43x43xf32>) // CHECK: arith.extf %{{.*}} : f16 to f32 - // CHECK: %[[CONV:.*]] = linalg.conv_3d_ndhwc_dhwcf {{{.*}}} ins(%{{.*}}, %{{.*}} : tensor<1x49x48x47x27xf16>, tensor<3x4x5x27x28xf16>) outs(%{{.*}} : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32> - // CHECK: 
tosa.cast %[[CONV]] : (tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf16> - %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xf16>, tensor<28x3x4x5x27xf16>, tensor<28xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x47x45x43x28xf16> + // CHECK: %[[CONV:.*]] = linalg.conv_3d_ndhwc_dhwcf {{{.*}}} ins(%{{.*}}, %{{.*}} : tensor<1x49x48x47x27xf16>, tensor<3x4x5x27x43xf16>) outs(%{{.*}} : tensor<1x47x45x43x43xf32>) -> tensor<1x47x45x43x43xf32> + // CHECK: tosa.cast %[[CONV]] : (tensor<1x47x45x43x43xf32>) -> tensor<1x47x45x43x43xf16> + %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xf16>, tensor<43x3x4x5x27xf16>, tensor<43xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x47x45x43x43xf16> return } diff --git a/mlir/test/Dialect/Tosa/availability.mlir b/mlir/test/Dialect/Tosa/availability.mlir index 75126a11ac504..7374cfd1145b9 100644 --- a/mlir/test/Dialect/Tosa/availability.mlir +++ b/mlir/test/Dialect/Tosa/availability.mlir @@ -38,12 +38,12 @@ func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, % // ----- // CHECK-LABEL: conv3d -func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>) -> tensor<1x4x8x21x34xf32> { +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>) -> tensor<1x4x8x21x34xf32> { // CHECK: profiles: [ [pro_int, pro_fp] ] // CHECK: extensions: [ [int4, int16, fp8e4m3, fp8e5m2, bf16] ] %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> %weight_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> - %0 = tosa.conv3d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> return %0 : tensor<1x4x8x21x34xf32> } diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index d153474593d80..59fd490330691 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -379,19 +379,19 @@ func.func @conv2d_weight_2x2(%arg0: tensor<4x10x10x1xf32>) -> tensor<4x9x9x1xf32 // ----- // CHECK-LABEL: @depthwise_conv2d_stride_2 -func.func @depthwise_conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<4x10x10x6xf32> { +func.func @depthwise_conv2d_stride_2(%arg0: tensor<4x11x11x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<4x6x6x6xf32> { // CHECK: tosa.depthwise_conv2d - %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<4x10x10x6xf32> - return %0 : tensor<4x10x10x6xf32> + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, pad = array, 
stride = array, dilation = array} : (tensor<4x11x11x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<4x6x6x6xf32> + return %0 : tensor<4x6x6x6xf32> } // ----- // CHECK-LABEL: @depthwise_conv2d_weight_2x2 -func.func @depthwise_conv2d_weight_2x2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<2x2x2x3xf32>, %arg2: tensor<6xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<4x10x10x6xf32> { +func.func @depthwise_conv2d_weight_2x2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<2x2x2x3xf32>, %arg2: tensor<6xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<4x9x9x6xf32> { // CHECK: tosa.depthwise_conv2d - %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<4x10x10x2xf32>, tensor<2x2x2x3xf32>, tensor<6xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<4x10x10x6xf32> - return %0 : tensor<4x10x10x6xf32> + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<4x10x10x2xf32>, tensor<2x2x2x3xf32>, tensor<6xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<4x9x9x6xf32> + return %0 : tensor<4x9x9x6xf32> } // ----- diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir index bb0d3b46955a1..c862ae375f33b 100644 --- a/mlir/test/Dialect/Tosa/invalid_extension.mlir +++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir @@ -26,9 +26,9 @@ func.func @test_conv2d(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<8x1x1x4xi4>, %ar } // ----- -func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xi16>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi48>, %arg3: tensor<1xi16>, %arg4: tensor<1xi8>) -> tensor<1x4x8x21x34xi48> { +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xi16>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<21xi48>, %arg3: tensor<1xi16>, %arg4: tensor<1xi8>) -> tensor<1x4x8x21x34xi48> { // expected-error@+1 {{'tosa.conv3d' op illegal: requires [int16] but not enabled in target}} - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i48, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi16>, tensor<34x1x1x1x17xi8>, tensor<34xi48>, tensor<1xi16>, tensor<1xi8>) -> tensor<1x4x8x21x34xi48> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i48, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi16>, tensor<34x1x1x1x17xi8>, tensor<21xi48>, tensor<1xi16>, tensor<1xi8>) -> tensor<1x4x8x21x34xi48> return %0 : tensor<1x4x8x21x34xi48> } @@ -445,10 +445,10 @@ func.func @test_conv2d_non_const_input_zp(%arg0: tensor<1x4x4x4xi8>, %arg1: tens // ----- -func.func @test_conv3d_non_const_weight_zp(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi32>, %arg3: tensor<1xi8>) -> tensor<1x4x8x21x34xi32> { +func.func @test_conv3d_non_const_weight_zp(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<21xi32>, %arg3: tensor<1xi8>) -> tensor<1x4x8x21x34xi32> { %input_zp = "tosa.const"() {values = dense<0> : tensor<1xi8> } : () -> tensor<1xi8> // expected-error@+1 {{'tosa.conv3d' op expected compile time resolvable constant, but got variable value for operand #4}} - %0 = tosa.conv3d %arg0, %arg1, %arg2, %input_zp, %arg3 {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<34xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x8x21x34xi32> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %input_zp, %arg3 {acc_type = i32, 
dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<21xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x8x21x34xi32> return %0 : tensor<1x4x8x21x34xi32> } diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index 8d91142c678fe..5307645324b81 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -619,182 +619,182 @@ func.func @test_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2 // ----- -func.func @test_conv3d_dilation_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_dilation_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_d * KD <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_dilation_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_dilation_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_y * KH <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_dilation_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_dilation_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_x * KW <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_pad_d0(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_pad_d0(%arg0: tensor<1x1x32x32x8xf32>, %arg1: 
tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_pad_d1(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_pad_d1(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_pad_top(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_pad_top(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_pad_bottom(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_pad_bottom(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_pad_left(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_pad_left(%arg0: tensor<1x1x32x32x8xf32>, 
%arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_pad_right(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_pad_right(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_stride_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_stride_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_stride_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_stride_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_conv3d_stride_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x1x32x32x16xf32> { +func.func @test_conv3d_stride_x(%arg0: 
tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}} %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x32x32x16xf32> - return %0 : tensor<1x1x32x32x16xf32> + (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: dilation_y * KH <= MAX_KERNEL}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: dilation_x * KW <= MAX_KERNEL}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, 
%arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: stride <= MAX_STRIDE}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- -func.func @test_depthwise_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, 
%arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x64xf32> { +func.func @test_depthwise_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<*xf32> { // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: stride <= MAX_STRIDE}} %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : - (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x64xf32> - return %0 : tensor<1x32x32x64xf32> + (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> } // ----- diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index b64074e412ed1..c1181825f0c97 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -104,15 +104,15 @@ func.func @test_conv2d_q8xi4(%arg0: tensor<1x11x11x3xi8>) -> tensor<1x1x1x3xi8> // ----- // CHECK-LABEL: conv3d -func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> return %0 : tensor<1x4x8x21x34xf32> } // ----- // CHECK-LABEL: conv3d_with_local_bound -func.func @test_conv3d_with_local_bound(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> +func.func @test_conv3d_with_local_bound(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> return %0 : tensor<1x4x8x21x34xf32> } @@ -823,8 +823,8 @@ func.func @test_conv2d_f8E5M2(%arg0: tensor<1x4x4x4xf8E5M2>, %arg1: tensor<8x1x1 // ----- // CHECK-LABEL: conv3d_f8E5M2 -func.func @test_conv3d_f8E5M2(%arg0: tensor<1x4x8x21x17xf8E5M2>, %arg1: tensor<34x1x1x1x17xf8E5M2>, %arg2: tensor<34xf16>, %arg3: tensor<1xf8E5M2>, %arg4: tensor<1xf8E5M2>) -> tensor<1x4x8x21x34xf16> { - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, dilation = array, pad = array, stride = array} : 
(tensor<1x4x8x21x17xf8E5M2>, tensor<34x1x1x1x17xf8E5M2>, tensor<34xf16>, tensor<1xf8E5M2>, tensor<1xf8E5M2>) -> tensor<1x4x8x21x34xf16> +func.func @test_conv3d_f8E5M2(%arg0: tensor<1x4x8x21x17xf8E5M2>, %arg1: tensor<34x1x1x1x17xf8E5M2>, %arg2: tensor<21xf16>, %arg3: tensor<1xf8E5M2>, %arg4: tensor<1xf8E5M2>) -> tensor<1x4x8x21x34xf16> { + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf8E5M2>, tensor<34x1x1x1x17xf8E5M2>, tensor<21xf16>, tensor<1xf8E5M2>, tensor<1xf8E5M2>) -> tensor<1x4x8x21x34xf16> return %0 : tensor<1x4x8x21x34xf16> } @@ -968,8 +968,8 @@ func.func @test_conv2d_f8E4M3FN(%arg0: tensor<1x4x4x4xf8E4M3FN>, %arg1: tensor<8 // ----- // CHECK-LABEL: conv3d_f8E4M3FN -func.func @test_conv3d_f8E4M3FN(%arg0: tensor<1x4x8x21x17xf8E4M3FN>, %arg1: tensor<34x1x1x1x17xf8E4M3FN>, %arg2: tensor<34xf16>, %arg3: tensor<1xf8E4M3FN>, %arg4: tensor<1xf8E4M3FN>) -> tensor<1x4x8x21x34xf16> { - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf8E4M3FN>, tensor<34x1x1x1x17xf8E4M3FN>, tensor<34xf16>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>) -> tensor<1x4x8x21x34xf16> +func.func @test_conv3d_f8E4M3FN(%arg0: tensor<1x4x8x21x17xf8E4M3FN>, %arg1: tensor<34x1x1x1x17xf8E4M3FN>, %arg2: tensor<21xf16>, %arg3: tensor<1xf8E4M3FN>, %arg4: tensor<1xf8E4M3FN>) -> tensor<1x4x8x21x34xf16> { + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf8E4M3FN>, tensor<34x1x1x1x17xf8E4M3FN>, tensor<21xf16>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>) -> tensor<1x4x8x21x34xf16> return %0 : tensor<1x4x8x21x34xf16> } diff --git a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir index 72669c62c95ca..efbb9e9d1843f 100644 --- a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir +++ b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir @@ -33,9 +33,9 @@ func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, % } // ----- -func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf16>, %arg1: tensor<34x1x1x1x17xf16>, %arg2: tensor<34xf16>, %arg3: tensor<1xf16>, %arg4: tensor<1xf16>) -> tensor<1x4x8x21x34xf16> { +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf16>, %arg1: tensor<34x1x1x1x17xf16>, %arg2: tensor<21xf16>, %arg3: tensor<1xf16>, %arg4: tensor<1xf16>) -> tensor<1x4x8x21x34xf16> { // expected-error@+1 {{'tosa.conv3d' op illegal: requires [pro_fp] but not enabled in target}} - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf16>, tensor<34x1x1x1x17xf16>, tensor<34xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x4x8x21x34xf16> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf16>, tensor<34x1x1x1x17xf16>, tensor<21xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x4x8x21x34xf16> return %0 : tensor<1x4x8x21x34xf16> } diff --git a/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir index e98b906377b22..b102eea5699dd 100644 --- a/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir +++ b/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir @@ -38,9 +38,9 @@ func.func @test_conv2d(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<8x1x1x4xi8>, %ar } // ----- 
-func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi32>, %arg3: tensor<1xi8>, %arg4: tensor<1xi8>) -> tensor<1x4x8x21x34xi32> { +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<21xi32>, %arg3: tensor<1xi8>, %arg4: tensor<1xi8>) -> tensor<1x4x8x21x34xi32> { // expected-error@+1 {{'tosa.conv3d' op illegal: requires [pro_int] but not enabled in target}} - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<34xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x8x21x34xi32> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<21xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x8x21x34xi32> return %0 : tensor<1x4x8x21x34xi32> } diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index fe9da2ac09650..c6ac8074c0326 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -824,27 +824,27 @@ func.func @conv2d_strided(%input: tensor<1x13x15x1xf32>, %weights: tensor<1x1x1x // ----- // CHECK-LABEL: @conv3d_static -func.func @conv3d_static(%input: tensor<2x8x9x10x3xf32>, %weights: tensor<5x3x6x4x3xf32>, %bias: tensor<5xf32>, %input_zp: tensor<1xf32>, %weight_zp: tensor<1xf32>) -> () { +func.func @conv3d_static(%input: tensor<2x8x9x10x3xf32>, %weights: tensor<5x3x6x4x3xf32>, %bias: tensor<7xf32>, %input_zp: tensor<1xf32>, %weight_zp: tensor<1xf32>) -> () { // CHECK: -> tensor<2x6x4x7x5xf32> - %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor + %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<7xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor return } // ----- // CHECK-LABEL: @conv3d_dynamic_input -func.func @conv3d_dynamic_input(%arg0: tensor, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<5xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { +func.func @conv3d_dynamic_input(%arg0: tensor, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<7xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { // CHECK: -> tensor - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor, tensor<5x3x6x4x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor, tensor<5x3x6x4x3xf32>, tensor<7xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor return } // ----- // CHECK-LABEL: @conv3d_dynamic_weight -func.func @conv3d_dynamic_weight(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor, %arg2: tensor<5xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { - // CHECK: -> tensor<2x?x?x?x5xf32> - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x8x9x10x3xf32>, tensor, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor +func.func @conv3d_dynamic_weight(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor, %arg2: tensor<7xf32>, 
%arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { + // CHECK: -> tensor<2x?x?x?x7xf32> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x8x9x10x3xf32>, tensor, tensor<7xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor return } @@ -860,27 +860,27 @@ func.func @conv3d_dynamic_bias(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x // ----- // CHECK-LABEL: @conv3d_padded -func.func @conv3d_padded(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<5xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { +func.func @conv3d_padded(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<18xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { // CHECK: -> tensor<2x9x11x18x5xf32> - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<18xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor return } // ----- // CHECK-LABEL: @conv3d_dilated -func.func @conv3d_dilated(%arg0: tensor<2x12x14x16x3xf32>, %arg1: tensor<5x3x6x2x3xf32>, %arg2: tensor<5xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { +func.func @conv3d_dilated(%arg0: tensor<2x12x14x16x3xf32>, %arg1: tensor<5x3x6x2x3xf32>, %arg2: tensor<12xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { // CHECK: -> tensor<2x6x4x12x5xf32> - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x12x14x16x3xf32>, tensor<5x3x6x2x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<2x12x14x16x3xf32>, tensor<5x3x6x2x3xf32>, tensor<12xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor return } // ----- // CHECK-LABEL: @conv3d_strided -func.func @conv3d_strided(%arg0: tensor<1x13x14x15x1xf32>, %arg1: tensor<1x1x1x1x1xf32>, %arg2: tensor<1xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { - // CHECK: -> tensor<1x5x7x4x1xf32> - %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x13x14x15x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor +func.func @conv3d_strided(%arg0: tensor<1x13x17x17x1xf32>, %arg1: tensor<1x1x1x1x1xf32>, %arg2: tensor<1xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { + // CHECK: -> tensor<1x5x9x5x1xf32> + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x13x17x17x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor return } @@ -941,9 +941,9 @@ func.func @depthwise_conv2d_dilated(%arg0: tensor<2x12x14x3xf32>, %arg1: tensor< // ----- // CHECK-LABEL: @depthwise_conv2d_strided -func.func @depthwise_conv2d_strided(%arg0: tensor<1x13x14x1xf32>, %arg1: tensor<1x1x1x1xf32>, %arg2: tensor<1xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { - // CHECK: -> tensor<1x5x7x1xf32> - %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) 
-> tensor<1x5x7x1xf32> +func.func @depthwise_conv2d_strided(%arg0: tensor<1x13x15x1xf32>, %arg1: tensor<1x1x1x1xf32>, %arg2: tensor<1xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { + // CHECK: -> tensor<1x5x8x1xf32> + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x13x15x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x5x8x1xf32> return } @@ -1396,7 +1396,6 @@ func.func @test_dynamic_batch_fft2d(%arg0: tensor, %arg1: tensor, %arg1 : tensor) -> () { // CHECK: tosa.equal %arg0, %arg1 : (tensor<*xf32>, tensor) -> tensor<*xi1> %0 = tosa.equal %arg0, %arg1 : (tensor<*xf32>, tensor) -> tensor<*xi1> - return } diff --git a/mlir/test/Dialect/Tosa/verifier.mlir b/mlir/test/Dialect/Tosa/verifier.mlir index efdd26a9346fb..fb8726cba1853 100644 --- a/mlir/test/Dialect/Tosa/verifier.mlir +++ b/mlir/test/Dialect/Tosa/verifier.mlir @@ -167,3 +167,155 @@ func.func @test_scalar_slice(%arg0: tensor) -> tensor { %2 = tosa.slice %arg0, %0, %1 : (tensor, !tosa.shape<0>, !tosa.shape<0>) -> tensor return %2 : tensor } + +// ----- + +func.func @test_depthwise_conv2d_invalid_padding(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op expect all padding values to be >= 0, got 0, 0, -1, 0}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_invalid_stride(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op expect all stride values to be >= 1, got 0, 1}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_invalid_dilation(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op expect all dilation values to be >= 1, got 1, 0}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_wholly_divisible_height(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op expected input_height - 1 + pad_top + pad_bottom - (kernel_height - 1) * dilation_y to be wholly divisible by stride_y, got (4 - 1 + 0 + 0 - (1 - 1) * 1) / 2}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = 
true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_wholly_divisible_width(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op expected input_width - 1 + pad_left + pad_right - (kernel_width - 1) * dilation_x to be wholly divisible by stride_x, got (4 - 1 + 0 + 0 - (1 - 1) * 1) / 2}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_unexpected_output_height(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x6x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op calculated output height did not match expected: calculated=4, expected=6}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x6x4x8xf32> + return %0 : tensor<1x6x4x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_unexpected_output_width(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x6x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op calculated output width did not match expected: calculated=4, expected=6}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x6x8xf32> + return %0 : tensor<1x4x6x8xf32> +} + +// ----- + +func.func @test_depthwise_conv2d_invalid_bias_size(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x8x4xf32>, %arg2: tensor<7xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op bias channels expected to be equal to output channels (8) or 1, got 7}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} + : (tensor<1x4x4x4xf32>, tensor<1x1x8x4xf32>, tensor<7xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- + +func.func @test_conv3d_invalid_padding(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op expect all padding values to be >= 0, got 0, -1, 0, -1, 0, 0}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} +// ----- + +func.func @test_conv3d_invalid_stride(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: 
tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op expect all stride values to be >= 1, got 0, 1, 1}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} + +// ----- + +func.func @test_conv3d_invalid_dilation(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op expect all dilation values to be >= 1, got 1, 0, 1}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} + +// ----- + +func.func @test_conv3d_wholly_divisible_input_depth(%arg0: tensor<1x4x16x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op expected input_depth - 1 + pad_front + pad_back - (kernel_depth - 1) * dilation_d to be wholly divisible by stride_d, got (4 - 1 + 0 + 0 - (1 - 1) * 1) / 2}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x16x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} + +// ----- + +func.func @test_conv3d_wholly_divisible_input_height(%arg0: tensor<1x4x10x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op expected input_height - 1 + pad_top + pad_bottom - (kernel_height - 1) * dilation_y to be wholly divisible by stride_y, got (10 - 1 + 0 + 0 - (1 - 1) * 1) / 4}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x10x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} + +// ----- + +func.func @test_conv3d_wholly_divisible_input_width(%arg0: tensor<1x4x8x21x19xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op expected input_width - 1 + pad_left + pad_right - (kernel_width - 1) * dilation_x to be wholly divisible by stride_x, got (21 - 1 + 0 + 0 - (1 - 1) * 1) / 8}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x8x21x19xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} + +// ----- + +func.func @test_conv3d_wholly_divisible_output_depth(%arg0: tensor<1x4x10x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x3x10x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op calculated output depth did not match expected: calculated=4, 
expected=3}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x10x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x3x10x21x34xf32> + return %0 : tensor<1x3x10x21x34xf32> +} + +// ----- + +func.func @test_conv3d_wholly_divisible_output_height(%arg0: tensor<1x4x16x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x21x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op calculated output height did not match expected: calculated=16, expected=8}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x16x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x21x34xf32> + return %0 : tensor<1x4x8x21x34xf32> +} + +// ----- + +func.func @test_conv3d_wholly_divisible_output_width(%arg0: tensor<1x4x8x21x19xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<21xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x8x19x34xf32> { + // expected-error@+1 {{'tosa.conv3d' op calculated output width did not match expected: calculated=21, expected=19}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} + : (tensor<1x4x8x21x19xf32>, tensor<34x1x1x1x17xf32>, tensor<21xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x8x19x34xf32> + return %0 : tensor<1x4x8x19x34xf32> +} From 8c47f23232fc8b547f643d379175f322d01e4cbd Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 23 Apr 2025 12:11:01 +0200 Subject: [PATCH 041/245] [SPIRV] Support for the SPV_INTEL_subgroup_matrix_multiply_accumulate SPIR-V extension (#135225) Adds support for the SPV_INTEL_subgroup_matrix_multiply_accumulate SPIR-V extension according to https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_subgroup_matrix_multiply_accumulate.asciidoc --- llvm/docs/SPIRVUsage.rst | 2 + llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 10 +- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 4 + llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 3 + llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 4 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 14 ++ .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 2 + ...roup_matrix_multiply_accumulate_generic.ll | 229 ++++++++++++++++++ 8 files changed, 266 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroup_matrix_multiply_accumulate/subgroup_matrix_multiply_accumulate_generic.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 406dfbea20b73..6ff8034cac00c 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -211,6 +211,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. To enable multiple extensions, list them separated by comma. 
For example, to enable support for atomic operations on floating-point numbers and arbitrary precision integers, use: diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 16364ab30f280..e090fb67b3231 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1161,9 +1161,15 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call, MachineRegisterInfo *MRI = MIRBuilder.getMRI(); if (Call->isSpirvOp()) { - if (GroupBuiltin->NoGroupOperation) + if (GroupBuiltin->NoGroupOperation) { + SmallVector ImmArgs; + if (GroupBuiltin->Opcode == + SPIRV::OpSubgroupMatrixMultiplyAccumulateINTEL && + Call->Arguments.size() > 4) + ImmArgs.push_back(getConstFromIntrinsic(Call->Arguments[4], MRI)); return buildOpFromWrapper(MIRBuilder, GroupBuiltin->Opcode, Call, - GR->getSPIRVTypeID(Call->ReturnType)); + GR->getSPIRVTypeID(Call->ReturnType), ImmArgs); + } // Group Operation is a literal Register GroupOpReg = Call->Arguments[1]; diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index b504e7b04d336..a3f27dde76b65 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -763,6 +763,7 @@ class GroupBuiltin { bit NoGroupOperation = !or(IsElect, IsAllOrAny, IsAllEqual, IsBallot, IsInverseBallot, IsBallotBitExtract, IsBallotFindBit, + !eq(operation, OpSubgroupMatrixMultiplyAccumulateINTEL), !eq(operation, OpGroupNonUniformShuffle), !eq(operation, OpGroupNonUniformShuffleXor), !eq(operation, OpGroupNonUniformShuffleUp), @@ -847,6 +848,9 @@ defm : DemangledGroupBuiltinWrapper<"__spirv_GroupNonUniformBallotFindLSB", 2, 2 defm : DemangledGroupBuiltin<"group_ballot_find_msb", OnlySub, OpGroupNonUniformBallotFindMSB>; defm : DemangledGroupBuiltinWrapper<"__spirv_GroupNonUniformBallotFindMSB", 2, 2, OpGroupNonUniformBallotFindMSB>; +// SPV_INTEL_subgroup_matrix_multiply_accumulate +defm : DemangledGroupBuiltinWrapper<"__spirv_SubgroupMatrixMultiplyAccumulateINTEL", 4, 5, OpSubgroupMatrixMultiplyAccumulateINTEL>; + // cl_khr_subgroup_shuffle defm : DemangledGroupBuiltin<"group_shuffle", OnlySub, OpGroupNonUniformShuffle>; defm : DemangledGroupBuiltinWrapper<"__spirv_GroupNonUniformShuffle", 3, 3, OpGroupNonUniformShuffle>; diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 86702bbe58f09..56cbd9414c9ee 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -93,6 +93,9 @@ static const std::map> SPIRV::Extension::Extension::SPV_INTEL_long_composites}, {"SPV_INTEL_fp_max_error", SPIRV::Extension::Extension::SPV_INTEL_fp_max_error}, + {"SPV_INTEL_subgroup_matrix_multiply_accumulate", + SPIRV::Extension::Extension:: + SPV_INTEL_subgroup_matrix_multiply_accumulate}, {"SPV_INTEL_ternary_bitwise_function", SPIRV::Extension::Extension::SPV_INTEL_ternary_bitwise_function}}; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 53064ebb51271..6d8c84945d7d4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -736,6 +736,10 @@ def OpGroupFMax: OpGroup<"FMax", 269>; def OpGroupUMax: OpGroup<"UMax", 270>; def OpGroupSMax: OpGroup<"SMax", 271>; +def OpSubgroupMatrixMultiplyAccumulateINTEL: Op<6237, (outs ID:$res), + (ins TYPE:$ty, ID:$KDim, ID:$A, ID:$B, ID:$C, variable_ops), + "$res = OpSubgroupMatrixMultiplyAccumulateINTEL $ty $KDim $A $B $C">; + // 
TODO: 3.42.22. Device-Side Enqueue Instructions def OpEnqueueKernel: Op<292, (outs ID:$res), (ins TYPE:$type, ID:$queue, ID:$flags, ID:$NDR, ID:$nevents, ID:$wevents, ID:$revent, ID:$invoke, ID:$param, ID:$psize, ID:$palign, variable_ops), diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index b1e5e4328cd32..6e1c41d9f20cb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1799,6 +1799,20 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::LongCompositesINTEL); break; } + case SPIRV::OpSubgroupMatrixMultiplyAccumulateINTEL: { + if (!ST.canUseExtension( + SPIRV::Extension::SPV_INTEL_subgroup_matrix_multiply_accumulate)) + report_fatal_error( + "OpSubgroupMatrixMultiplyAccumulateINTEL instruction requires the " + "following SPIR-V " + "extension: SPV_INTEL_subgroup_matrix_multiply_accumulate", + false); + Reqs.addExtension( + SPIRV::Extension::SPV_INTEL_subgroup_matrix_multiply_accumulate); + Reqs.addCapability( + SPIRV::Capability::SubgroupMatrixMultiplyAccumulateINTEL); + break; + } case SPIRV::OpBitwiseFunctionINTEL: { if (!ST.canUseExtension( SPIRV::Extension::SPV_INTEL_ternary_bitwise_function)) diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 0db8a37f8683c..afd3a5206926c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -314,6 +314,7 @@ defm SPV_INTEL_long_composites : ExtensionOperand<117>; defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118>; defm SPV_INTEL_fp_max_error : ExtensionOperand<119>; defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120>; +defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -515,6 +516,7 @@ defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_ima defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; defm FPMaxErrorINTEL : CapabilityOperand<6169, 0, 0, [SPV_INTEL_fp_max_error], []>; defm TernaryBitwiseFunctionINTEL : CapabilityOperand<6241, 0, 0, [SPV_INTEL_ternary_bitwise_function], []>; +defm SubgroupMatrixMultiplyAccumulateINTEL : CapabilityOperand<6236, 0, 0, [SPV_INTEL_subgroup_matrix_multiply_accumulate], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroup_matrix_multiply_accumulate/subgroup_matrix_multiply_accumulate_generic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroup_matrix_multiply_accumulate/subgroup_matrix_multiply_accumulate_generic.ll new file mode 100644 index 0000000000000..0cd6992936eeb --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroup_matrix_multiply_accumulate/subgroup_matrix_multiply_accumulate_generic.ll @@ -0,0 +1,229 @@ +; Adapted from Khronos Translator: subgroup_matrix_multiply_accumulate_generic.ll + +; generated with mma.cl: +; #pragma OPENCL EXTENSION cl_khr_fp16 : enable +; +; // all combinations of parameter types +; int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int Matrix_A, int8 Matrix_B, int Matrix_C, int Operands); +; int2 
__spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int2 Matrix_A, int8 Matrix_B, int2 Matrix_C, int Operands); +; int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int4 Matrix_A, int8 Matrix_B, int4 Matrix_C, int Operands); +; int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int8 Matrix_A, int8 Matrix_B, int8 Matrix_C, int Operands); +; +; float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int Matrix_A, int8 Matrix_B, float Matrix_C, int Operands); +; float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int2 Matrix_A, int8 Matrix_B, float2 Matrix_C, int Operands); +; float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int4 Matrix_A, int8 Matrix_B, float4 Matrix_C, int Operands); +; float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, int8 Matrix_A, int8 Matrix_B, float8 Matrix_C, int Operands); +; +; int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short Matrix_A, int8 Matrix_B, int Matrix_C, int Operands); +; int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short2 Matrix_A, int8 Matrix_B, int2 Matrix_C, int Operands); +; int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short4 Matrix_A, int8 Matrix_B, int4 Matrix_C, int Operands); +; int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short8 Matrix_A, int8 Matrix_B, int8 Matrix_C, int Operands); +; +; float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short Matrix_A, int8 Matrix_B, float Matrix_C, int Operands); +; float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short2 Matrix_A, int8 Matrix_B, float2 Matrix_C, int Operands); +; float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short4 Matrix_A, int8 Matrix_B, float4 Matrix_C, int Operands); +; float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short8 Matrix_A, int8 Matrix_B, float8 Matrix_C, int Operands); +; +; half __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short Matrix_A, int8 Matrix_B, half Matrix_C, int Operands); +; half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short2 Matrix_A, int8 Matrix_B, half2 Matrix_C, int Operands); +; half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short4 Matrix_A, int8 Matrix_B, half4 Matrix_C, int Operands); +; half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short8 Matrix_A, int8 Matrix_B, half8 Matrix_C, int Operands); +; +; short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short Matrix_A, int8 Matrix_B, short Matrix_C, int Operands); +; short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short2 Matrix_A, int8 Matrix_B, short2 Matrix_C, int Operands); +; short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short4 Matrix_A, int8 Matrix_B, short4 Matrix_C, int Operands); +; short8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short8 Matrix_A, int8 Matrix_B, short8 Matrix_C, int Operands); +; +; float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, float Matrix_A, float8 Matrix_B, float Matrix_C, int Operands); +; float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, float2 Matrix_A, float8 Matrix_B, float2 Matrix_C, int Operands); +; float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, float4 Matrix_A, float8 Matrix_B, float4 Matrix_C, int Operands); +; float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, float8 Matrix_A, float8 Matrix_B, float8 Matrix_C, int Operands); +; +; // no operands +; float4 
__spirv_SubgroupMatrixMultiplyAccumulateINTEL(int K_Dim, short4 Matrix_A, int8 Matrix_B, float4 Matrix_C); +; +; void foo(int iM, int2 iM2, int4 iM4, int8 iM8, +; short sM, short2 sM2, short4 sM4, short8 sM8, +; float fM, float2 fM2, float4 fM4, float8 fM8, +; half hM, half2 hM2, half4 hM4, half8 hM8) { +; const int i = 42; +; int D = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM, iM8, iM, 0xA); +; int2 D2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM2, iM8, iM2, 0xA); +; int4 D4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM4, iM8, iM4, 0xA); +; int8 D8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM8, iM8, iM8, 0xA); +; +; float fD = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM, iM8, fM, 0xA); +; float2 fD2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM2, iM8, fM2, 0xA); +; float4 fD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM4, iM8, fM4, 0xA); +; float8 fD8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, iM8, iM8, fM8, 0xA); +; +; int sD = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM, iM8, iM, 0xA); +; int2 sD2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM2, iM8, iM2, 0xA); +; int4 sD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM4, iM8, iM4, 0xA); +; int8 sD8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM8, iM8, iM8, 0xA); +; +; float sfD = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM, iM8, fM, 0xA); +; float2 sfD2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM2, iM8, fM2, 0xA); +; float4 sfD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM4, iM8, fM4, 0xA); +; float8 sfD8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM8, iM8, fM8, 0xA); +; +; half hD = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM, iM8, hM, 0xA); +; half2 hD2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM2, iM8, hM2, 0xA); +; half4 hD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM4, iM8, hM4, 0xA); +; half8 hD8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM8, iM8, hM8, 0xA); +; +; short ssD = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM, iM8, sM, 0xA); +; short2 ssD2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM2, iM8, sM2, 0xA); +; short4 ssD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM4, iM8, sM4, 0xA); +; short8 ssD8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM8, iM8, sM8, 0xA); +; +; float ffD = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, fM, fM8, fM, 0xA); +; float2 ffD2 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, fM2, fM8, fM2, 0xA); +; float4 ffD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, fM4, fM8, fM4, 0xA); +; float8 ffD8 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, fM8, fM8, fM8, 0xA); +; +; float4 noOpD4 = __spirv_SubgroupMatrixMultiplyAccumulateINTEL(i, sM4, iM8, fM4); +; } +; clang -cc1 -cl-std=clc++2021 -triple spir64-unknown-unknown -emit-llvm -finclude-default-header mma.cl -o tmp.ll + +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; CHECK-ERROR: requires the following SPIR-V extension: SPV_INTEL_subgroup_matrix_multiply_accumulate + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_subgroup_matrix_multiply_accumulate %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_subgroup_matrix_multiply_accumulate %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown 
--spirv-ext=+SPV_INTEL_subgroup_matrix_multiply_accumulate %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_subgroup_matrix_multiply_accumulate %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability SubgroupMatrixMultiplyAccumulateINTEL +; CHECK: OpExtension "SPV_INTEL_subgroup_matrix_multiply_accumulate" +; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Int16Ty:]] = OpTypeInt 16 0 +; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#Int32Ty]] 42 +; CHECK-DAG: %[[#VoidTy:]] = OpTypeVoid +; CHECK-DAG: %[[#Vec2Int32Ty:]] = OpTypeVector %[[#Int32Ty]] 2 +; CHECK-DAG: %[[#Vec4Int32Ty:]] = OpTypeVector %[[#Int32Ty]] 4 +; CHECK-DAG: %[[#Vec8Int32Ty:]] = OpTypeVector %[[#Int32Ty]] 8 +; CHECK-DAG: %[[#Vec2Int16Ty:]] = OpTypeVector %[[#Int16Ty]] 2 +; CHECK-DAG: %[[#Vec4Int16Ty:]] = OpTypeVector %[[#Int16Ty]] 4 +; CHECK-DAG: %[[#Vec8Int16Ty:]] = OpTypeVector %[[#Int16Ty]] 8 +; CHECK-DAG: %[[#FloatTy:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#Vec2FloatTy:]] = OpTypeVector %[[#FloatTy]] 2 +; CHECK-DAG: %[[#Vec4FloatTy:]] = OpTypeVector %[[#FloatTy]] 4 +; CHECK-DAG: %[[#Vec8FloatTy:]] = OpTypeVector %[[#FloatTy]] 8 +; CHECK-DAG: %[[#HalfTy:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#Vec2HalfTy:]] = OpTypeVector %[[#HalfTy]] 2 +; CHECK-DAG: %[[#Vec4HalfTy:]] = OpTypeVector %[[#HalfTy]] 4 +; CHECK-DAG: %[[#Vec8HalfTy:]] = OpTypeVector %[[#HalfTy]] 8 +; CHECK: %[[#iM:]] = OpFunctionParameter %[[#Int32Ty]] +; CHECK: %[[#iM2:]] = OpFunctionParameter %[[#Vec2Int32Ty]] +; CHECK: %[[#iM4:]] = OpFunctionParameter %[[#Vec4Int32Ty]] +; CHECK: %[[#iM8:]] = OpFunctionParameter %[[#Vec8Int32Ty]] +; CHECK: %[[#sM:]] = OpFunctionParameter %[[#Int16Ty]] +; CHECK: %[[#sM2:]] = OpFunctionParameter %[[#Vec2Int16Ty]] +; CHECK: %[[#sM4:]] = OpFunctionParameter %[[#Vec4Int16Ty]] +; CHECK: %[[#sM8:]] = OpFunctionParameter %[[#Vec8Int16Ty]] +; CHECK: %[[#fM:]] = OpFunctionParameter %[[#FloatTy]] +; CHECK: %[[#fM2:]] = OpFunctionParameter %[[#Vec2FloatTy]] +; CHECK: %[[#fM4:]] = OpFunctionParameter %[[#Vec4FloatTy]] +; CHECK: %[[#fM8:]] = OpFunctionParameter %[[#Vec8FloatTy]] +; CHECK: %[[#hM:]] = OpFunctionParameter %[[#HalfTy]] +; CHECK: %[[#hM2:]] = OpFunctionParameter %[[#Vec2HalfTy]] +; CHECK: %[[#hM4:]] = OpFunctionParameter %[[#Vec4HalfTy]] +; CHECK: %[[#hM8:]] = OpFunctionParameter %[[#Vec8HalfTy]] +; CHECK: %[[#]] = OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Int32Ty]] %[[#Const42]] %[[#iM]] %[[#iM8]] %[[#iM]] 10 +; CHECK: %[[#]] = OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2Int32Ty]] %[[#Const42]] %[[#iM2]] %[[#iM8]] %[[#iM2]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4Int32Ty]] %[[#Const42]] %[[#iM4]] %[[#iM8]] %[[#iM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8Int32Ty]] %[[#Const42]] %[[#iM8]] %[[#iM8]] %[[#iM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#FloatTy]] %[[#Const42]] %[[#iM]] %[[#iM8]] %[[#fM]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2FloatTy]] %[[#Const42]] %[[#iM2]] %[[#iM8]] %[[#fM2]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4FloatTy]] %[[#Const42]] %[[#iM4]] %[[#iM8]] %[[#fM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8FloatTy]] %[[#Const42]] %[[#iM8]] %[[#iM8]] %[[#fM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Int32Ty]] %[[#Const42]] %[[#sM]] %[[#iM8]] %[[#iM]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2Int32Ty]] %[[#Const42]] %[[#sM2]] %[[#iM8]] %[[#iM2]] 10 +; CHECK: 
OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4Int32Ty]] %[[#Const42]] %[[#sM4]] %[[#iM8]] %[[#iM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8Int32Ty]] %[[#Const42]] %[[#sM8]] %[[#iM8]] %[[#iM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#FloatTy]] %[[#Const42]] %[[#sM]] %[[#iM8]] %[[#fM]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2FloatTy]] %[[#Const42]] %[[#sM2]] %[[#iM8]] %[[#fM2]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4FloatTy]] %[[#Const42]] %[[#sM4]] %[[#iM8]] %[[#fM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8FloatTy]] %[[#Const42]] %[[#sM8]] %[[#iM8]] %[[#fM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#HalfTy]] %[[#Const42]] %[[#sM]] %[[#iM8]] %[[#hM]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2HalfTy]] %[[#Const42]] %[[#sM2]] %[[#iM8]] %[[#hM2]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4HalfTy]] %[[#Const42]] %[[#sM4]] %[[#iM8]] %[[#hM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8HalfTy]] %[[#Const42]] %[[#sM8]] %[[#iM8]] %[[#hM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Int16Ty]] %[[#Const42]] %[[#sM]] %[[#iM8]] %[[#sM]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2Int16Ty]] %[[#Const42]] %[[#sM2]] %[[#iM8]] %[[#sM2]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4Int16Ty]] %[[#Const42]] %[[#sM4]] %[[#iM8]] %[[#sM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8Int16Ty]] %[[#Const42]] %[[#sM8]] %[[#iM8]] %[[#sM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#FloatTy]] %[[#Const42]] %[[#fM]] %[[#fM8]] %[[#fM]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec2FloatTy]] %[[#Const42]] %[[#fM2]] %[[#fM8]] %[[#fM2]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4FloatTy]] %[[#Const42]] %[[#fM4]] %[[#fM8]] %[[#fM4]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec8FloatTy]] %[[#Const42]] %[[#fM8]] %[[#fM8]] %[[#fM8]] 10 +; CHECK: OpSubgroupMatrixMultiplyAccumulateINTEL %[[#Vec4FloatTy]] %[[#Const42]] %[[#sM4]] %[[#iM8]] %[[#fM4]] + +define spir_func void @foo(i32 %iM, <2 x i32> %iM2, <4 x i32> %iM4, <8 x i32> %iM8, + i16 signext %sM, <2 x i16> %sM2, <4 x i16> %sM4, <8 x i16> %sM8, + float %fM, <2 x float> %fM2, <4 x float> %fM4, <8 x float> %fM8, + half %hM, <2 x half> %hM2, <4 x half> %hM4, <8 x half> %hM8) { +entry: + %call = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiiDv8_iii(i32 42, i32 %iM, <8 x i32> %iM8, i32 %iM, i32 10) + %call1 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_iDv8_iS_i(i32 42, <2 x i32> %iM2, <8 x i32> %iM8, <2 x i32> %iM2, i32 10) + %call2 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_iDv8_iS_i(i32 42, <4 x i32> %iM4, <8 x i32> %iM8, <4 x i32> %iM4, i32 10) + %call3 = call spir_func <8 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_iS_S_i(i32 42, <8 x i32> %iM8, <8 x i32> %iM8, <8 x i32> %iM8, i32 10) + %call4 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiiDv8_ifi(i32 42, i32 %iM, <8 x i32> %iM8, float %fM, i32 10) + %call5 = call spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_iDv8_iDv2_fi(i32 42, <2 x i32> %iM2, <8 x i32> %iM8, <2 x float> %fM2, i32 10) + %call6 = call spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_iDv8_iDv4_fi(i32 42, <4 x i32> %iM4, <8 x i32> %iM8, <4 x float> %fM4, i32 
10) + %call7 = call spir_func <8 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_iS_Dv8_fi(i32 42, <8 x i32> %iM8, <8 x i32> %iM8, <8 x float> %fM8, i32 10) + %call8 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 42, i16 signext %sM, <8 x i32> %iM8, i32 %iM, i32 10) + %call9 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iDv2_ii(i32 42, <2 x i16> %sM2, <8 x i32> %iM8, <2 x i32> %iM2, i32 10) + %call10 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_ii(i32 42, <4 x i16> %sM4, <8 x i32> %iM8, <4 x i32> %iM4, i32 10) + %call11 = call spir_func <8 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iS0_i(i32 42, <8 x i16> %sM8, <8 x i32> %iM8, <8 x i32> %iM8, i32 10) + %call12 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_ifi(i32 42, i16 signext %sM, <8 x i32> %iM8, float %fM, i32 10) + %call13 = call spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iDv2_fi(i32 42, <2 x i16> %sM2, <8 x i32> %iM8, <2 x float> %fM2, i32 10) + %call14 = call spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_fi(i32 42, <4 x i16> %sM4, <8 x i32> %iM8, <4 x float> %fM4, i32 10) + %call15 = call spir_func <8 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_fi(i32 42, <8 x i16> %sM8, <8 x i32> %iM8, <8 x float> %fM8, i32 10) + %call16 = call spir_func half @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iDhi(i32 42, i16 signext %sM, <8 x i32> %iM8, half %hM, i32 10) + %call17 = call spir_func <2 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iDv2_Dhi(i32 42, <2 x i16> %sM2, <8 x i32> %iM8, <2 x half> %hM2, i32 10) + %call18 = call spir_func <4 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_Dhi(i32 42, <4 x i16> %sM4, <8 x i32> %iM8, <4 x half> %hM4, i32 10) + %call19 = call spir_func <8 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_Dhi(i32 42, <8 x i16> %sM8, <8 x i32> %iM8, <8 x half> %hM8, i32 10) + %call20 = call spir_func signext i16 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_isi(i32 42, i16 signext %sM, <8 x i32> %iM8, i16 signext %sM, i32 10) + %call21 = call spir_func <2 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iS_i(i32 42, <2 x i16> %sM2, <8 x i32> %iM8, <2 x i16> %sM2, i32 10) + %call22 = call spir_func <4 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iS_i(i32 42, <4 x i16> %sM4, <8 x i32> %iM8, <4 x i16> %sM4, i32 10) + %call23 = call spir_func <8 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iS_i(i32 42, <8 x i16> %sM8, <8 x i32> %iM8, <8 x i16> %sM8, i32 10) + %call24 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELifDv8_ffi(i32 42, float %fM, <8 x float> %fM8, float %fM, i32 10) + %call25 = call spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_fDv8_fS_i(i32 42, <2 x float> %fM2, <8 x float> %fM8, <2 x float> %fM2, i32 10) + %call26 = call spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_fDv8_fS_i(i32 42, <4 x float> %fM4, <8 x float> %fM8, <4 x float> %fM4, i32 10) + %call27 = call spir_func <8 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_fS_S_i(i32 42, <8 x float> %fM8, <8 x float> %fM8, <8 x float> %fM8, i32 10) + %call28 = call spir_func <4 x float> 
@_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_f(i32 42, <4 x i16> %sM4, <8 x i32> %iM8, <4 x float> %fM4) + ret void +} + +declare spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiiDv8_iii(i32, i32, <8 x i32>, i32, i32) +declare spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_iDv8_iS_i(i32, <2 x i32>, <8 x i32>, <2 x i32>, i32) +declare spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_iDv8_iS_i(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) +declare spir_func <8 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_iS_S_i(i32, <8 x i32>, <8 x i32>, <8 x i32>, i32) +declare spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiiDv8_ifi(i32, i32, <8 x i32>, float, i32) +declare spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_iDv8_iDv2_fi(i32, <2 x i32>, <8 x i32>, <2 x float>, i32) +declare spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_iDv8_iDv4_fi(i32, <4 x i32>, <8 x i32>, <4 x float>, i32) +declare spir_func <8 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_iS_Dv8_fi(i32, <8 x i32>, <8 x i32>, <8 x float>, i32) +declare spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32, i16 signext, <8 x i32>, i32, i32) +declare spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iDv2_ii(i32, <2 x i16>, <8 x i32>, <2 x i32>, i32) +declare spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_ii(i32, <4 x i16>, <8 x i32>, <4 x i32>, i32) +declare spir_func <8 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iS0_i(i32, <8 x i16>, <8 x i32>, <8 x i32>, i32) +declare spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_ifi(i32, i16 signext, <8 x i32>, float, i32) +declare spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iDv2_fi(i32, <2 x i16>, <8 x i32>, <2 x float>, i32) +declare spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_fi(i32, <4 x i16>, <8 x i32>, <4 x float>, i32) +declare spir_func <8 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_fi(i32, <8 x i16>, <8 x i32>, <8 x float>, i32) +declare spir_func half @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iDhi(i32, i16 signext, <8 x i32>, half, i32) +declare spir_func <2 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iDv2_Dhi(i32, <2 x i16>, <8 x i32>, <2 x half>, i32) +declare spir_func <4 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_Dhi(i32, <4 x i16>, <8 x i32>, <4 x half>, i32) +declare spir_func <8 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_Dhi(i32, <8 x i16>, <8 x i32>, <8 x half>, i32) +declare spir_func signext i16 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_isi(i32, i16 signext, <8 x i32>, i16 signext, i32) +declare spir_func <2 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv8_iS_i(i32, <2 x i16>, <8 x i32>, <2 x i16>, i32) +declare spir_func <4 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iS_i(i32, <4 x i16>, <8 x i32>, <4 x i16>, i32) +declare spir_func <8 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iS_i(i32, <8 x i16>, <8 x i32>, <8 x i16>, i32) +declare spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELifDv8_ffi(i32, float, <8 x float>, float, i32) +declare spir_func <2 
x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_fDv8_fS_i(i32, <2 x float>, <8 x float>, <2 x float>, i32) +declare spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_fDv8_fS_i(i32, <4 x float>, <8 x float>, <4 x float>, i32) +declare spir_func <8 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_fS_S_i(i32, <8 x float>, <8 x float>, <8 x float>, i32) +declare spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv8_iDv4_f(i32, <4 x i16>, <8 x i32>, <4 x float>) From 15d8b3cae9debc2bd7d27ca92ff599ba9fb30da5 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 23 Apr 2025 11:29:42 +0100 Subject: [PATCH 042/245] [LLVM][ISel][AArch64 Remove AArch64ISD::FCM##z nodes. (#135817) We can easily select compare-to-zero instructions without dedicated nodes. The test changes show opportunities that were previous missed because of the redundant complexity. --- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 3 +- .../Target/AArch64/AArch64ISelLowering.cpp | 37 +--------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7 -- .../lib/Target/AArch64/AArch64InstrFormats.td | 2 +- llvm/lib/Target/AArch64/AArch64InstrGISel.td | 36 ---------- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 19 +++-- .../GISel/AArch64PostLegalizerLowering.cpp | 70 ++++++------------- .../GlobalISel/lower-neon-vector-fcmp.mir | 68 ++++++++++++------ .../GlobalISel/select-neon-vector-fcmp.mir | 10 +-- llvm/test/CodeGen/AArch64/arm64-zip.ll | 3 +- llvm/test/CodeGen/AArch64/select_cc.ll | 5 +- 11 files changed, 93 insertions(+), 167 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 223d69c362185..d8cc86b34a819 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1385,7 +1385,8 @@ bool llvm::isBuildVectorConstantSplat(const Register Reg, const MachineRegisterInfo &MRI, int64_t SplatValue, bool AllowUndef) { if (auto SplatValAndReg = getAnyConstantSplat(Reg, MRI, AllowUndef)) - return mi_match(SplatValAndReg->VReg, MRI, m_SpecificICst(SplatValue)); + return SplatValAndReg->Value.getSExtValue() == SplatValue; + return false; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cb8f324b61187..d609303a7e79b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2595,11 +2595,6 @@ unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode( case AArch64ISD::FCMEQ: case AArch64ISD::FCMGE: case AArch64ISD::FCMGT: - case AArch64ISD::FCMEQz: - case AArch64ISD::FCMGEz: - case AArch64ISD::FCMGTz: - case AArch64ISD::FCMLEz: - case AArch64ISD::FCMLTz: // Compares return either 0 or all-ones return VTBits; case AArch64ISD::VASHR: { @@ -2816,11 +2811,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FCMEQ) MAKE_CASE(AArch64ISD::FCMGE) MAKE_CASE(AArch64ISD::FCMGT) - MAKE_CASE(AArch64ISD::FCMEQz) - MAKE_CASE(AArch64ISD::FCMGEz) - MAKE_CASE(AArch64ISD::FCMGTz) - MAKE_CASE(AArch64ISD::FCMLEz) - MAKE_CASE(AArch64ISD::FCMLTz) MAKE_CASE(AArch64ISD::SADDV) MAKE_CASE(AArch64ISD::UADDV) MAKE_CASE(AArch64ISD::UADDLV) @@ -15829,40 +15819,19 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); - APInt SplatValue; - APInt SplatUndef; - unsigned SplatBitSize = 0; - bool HasAnyUndefs; - 
- BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); - bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef, - SplatBitSize, HasAnyUndefs); - - bool IsZero = IsCnst && SplatValue == 0; - if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { default: return SDValue(); case AArch64CC::NE: { - SDValue Fcmeq; - if (IsZero) - Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); - else - Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); return DAG.getNOT(dl, Fcmeq, VT); } case AArch64CC::EQ: - if (IsZero) - return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: - if (IsZero) - return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); case AArch64CC::GT: - if (IsZero) - return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (!NoNans) @@ -15870,8 +15839,6 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, // If we ignore NaNs then we can use to the LS implementation. [[fallthrough]]; case AArch64CC::LS: - if (IsZero) - return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); case AArch64CC::LT: if (!NoNans) @@ -15879,8 +15846,6 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, // If we ignore NaNs then we can use to the MI implementation. [[fallthrough]]; case AArch64CC::MI: - if (IsZero) - return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 0d51ef2be8631..adbe7e9d0a0f3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -245,13 +245,6 @@ enum NodeType : unsigned { FCMGE, FCMGT, - // Vector zero comparisons - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - // Round wide FP to narrow FP with inexact results to odd. FCVTXN, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 9bbcb6f3aedf5..2a0da9a1373ee 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7136,7 +7136,7 @@ multiclass SIMDCmpTwoVector opc, string asm, // FP Comparisons support only S and D element sizes (and H for v8.2a). 
multiclass SIMDFPCmpTwoVector opc, - string asm, SDNode OpNode> { + string asm, SDPatternOperator OpNode> { let mayRaiseFPException = 1, Uses = [FPCR] in { let Predicates = [HasNEON, HasFullFP16] in { diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index a99019d72b4ac..7322212c5bb24 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -179,36 +179,6 @@ def G_FCMGT : AArch64GenericInstruction { let hasSideEffects = 0; } -def G_FCMEQZ : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); - let hasSideEffects = 0; -} - -def G_FCMGEZ : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); - let hasSideEffects = 0; -} - -def G_FCMGTZ : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); - let hasSideEffects = 0; -} - -def G_FCMLEZ : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); - let hasSideEffects = 0; -} - -def G_FCMLTZ : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); - let hasSideEffects = 0; -} - def G_AARCH64_PREFETCH : AArch64GenericInstruction { let OutOperandList = (outs); let InOperandList = (ins type0:$imm, ptype0:$src1); @@ -295,12 +265,6 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; - def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index a060a2f597ccd..de7a6e6ec0088 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -882,11 +882,20 @@ def AArch64cmltz : PatFrag<(ops node:$lhs), def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>; -def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; -def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; -def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; -def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; -def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; +def AArch64fcmeqz : PatFrag<(ops node:$lhs), + (AArch64fcmeq node:$lhs, immAllZerosV)>; + +def AArch64fcmgez : PatFrag<(ops node:$lhs), + (AArch64fcmge node:$lhs, immAllZerosV)>; + +def AArch64fcmgtz : PatFrag<(ops node:$lhs), + (AArch64fcmgt node:$lhs, immAllZerosV)>; + +def AArch64fcmlez : PatFrag<(ops node:$lhs), + (AArch64fcmge immAllZerosV, node:$lhs)>; + +def AArch64fcmltz : PatFrag<(ops node:$lhs), + (AArch64fcmgt immAllZerosV, node:$lhs)>; def AArch64fcvtxn_n: SDNode<"AArch64ISD::FCVTXN", SDTFPRoundOp>; def AArch64fcvtxnsdr: PatFrags<(ops node:$Rn), diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 4785c7b68d94d..81ee525ed0501 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -808,16 +808,14 @@ void applyScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) { assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR); - auto 
Splat = getAArch64VectorSplat(MI, MRI); - if (!Splat) - return false; - if (Splat->isReg()) - return true; + // Later, during selection, we'll try to match imported patterns using // immAllOnesV and immAllZerosV. These require G_BUILD_VECTOR. Don't lower // G_BUILD_VECTORs which could match those patterns. - int64_t Cst = Splat->getCst(); - return (Cst != 0 && Cst != -1); + if (isBuildVectorAllZeros(MI, MRI) || isBuildVectorAllOnes(MI, MRI)) + return false; + + return getAArch64VectorSplat(MI, MRI).has_value(); } void applyBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -933,11 +931,10 @@ void applySwapICmpOperands(MachineInstr &MI, GISelChangeObserver &Observer) { /// \returns a function which builds a vector floating point compare instruction /// for a condition code \p CC. -/// \param [in] IsZero - True if the comparison is against 0. /// \param [in] NoNans - True if the target has NoNansFPMath. std::function -getVectorFCMP(AArch64CC::CondCode CC, Register LHS, Register RHS, bool IsZero, - bool NoNans, MachineRegisterInfo &MRI) { +getVectorFCMP(AArch64CC::CondCode CC, Register LHS, Register RHS, bool NoNans, + MachineRegisterInfo &MRI) { LLT DstTy = MRI.getType(LHS); assert(DstTy.isVector() && "Expected vector types only?"); assert(DstTy == MRI.getType(RHS) && "Src and Dst types must match!"); @@ -945,46 +942,29 @@ getVectorFCMP(AArch64CC::CondCode CC, Register LHS, Register RHS, bool IsZero, default: llvm_unreachable("Unexpected condition code!"); case AArch64CC::NE: - return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) { - auto FCmp = IsZero - ? MIB.buildInstr(AArch64::G_FCMEQZ, {DstTy}, {LHS}) - : MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS}); + return [LHS, RHS, DstTy](MachineIRBuilder &MIB) { + auto FCmp = MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS}); return MIB.buildNot(DstTy, FCmp).getReg(0); }; case AArch64CC::EQ: - return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) { - return IsZero - ? MIB.buildInstr(AArch64::G_FCMEQZ, {DstTy}, {LHS}).getReg(0) - : MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS}) - .getReg(0); + return [LHS, RHS, DstTy](MachineIRBuilder &MIB) { + return MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS}).getReg(0); }; case AArch64CC::GE: - return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) { - return IsZero - ? MIB.buildInstr(AArch64::G_FCMGEZ, {DstTy}, {LHS}).getReg(0) - : MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {LHS, RHS}) - .getReg(0); + return [LHS, RHS, DstTy](MachineIRBuilder &MIB) { + return MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {LHS, RHS}).getReg(0); }; case AArch64CC::GT: - return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) { - return IsZero - ? MIB.buildInstr(AArch64::G_FCMGTZ, {DstTy}, {LHS}).getReg(0) - : MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {LHS, RHS}) - .getReg(0); + return [LHS, RHS, DstTy](MachineIRBuilder &MIB) { + return MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {LHS, RHS}).getReg(0); }; case AArch64CC::LS: - return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) { - return IsZero - ? MIB.buildInstr(AArch64::G_FCMLEZ, {DstTy}, {LHS}).getReg(0) - : MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {RHS, LHS}) - .getReg(0); + return [LHS, RHS, DstTy](MachineIRBuilder &MIB) { + return MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {RHS, LHS}).getReg(0); }; case AArch64CC::MI: - return [LHS, RHS, IsZero, DstTy](MachineIRBuilder &MIB) { - return IsZero - ? 
MIB.buildInstr(AArch64::G_FCMLTZ, {DstTy}, {LHS}).getReg(0) - : MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {RHS, LHS}) - .getReg(0); + return [LHS, RHS, DstTy](MachineIRBuilder &MIB) { + return MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {RHS, LHS}).getReg(0); }; } } @@ -1024,23 +1004,17 @@ void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI, LLT DstTy = MRI.getType(Dst); - auto Splat = getAArch64VectorSplat(*MRI.getVRegDef(RHS), MRI); - - // Compares against 0 have special target-specific pseudos. - bool IsZero = Splat && Splat->isCst() && Splat->getCst() == 0; - bool Invert = false; AArch64CC::CondCode CC, CC2 = AArch64CC::AL; if ((Pred == CmpInst::Predicate::FCMP_ORD || Pred == CmpInst::Predicate::FCMP_UNO) && - IsZero) { + isBuildVectorAllZeros(*MRI.getVRegDef(RHS), MRI)) { // The special case "fcmp ord %a, 0" is the canonical check that LHS isn't // NaN, so equivalent to a == a and doesn't need the two comparisons an // "ord" normally would. // Similarly, "fcmp uno %a, 0" is the canonical check that LHS is NaN and is // thus equivalent to a != a. RHS = LHS; - IsZero = false; CC = Pred == CmpInst::Predicate::FCMP_ORD ? AArch64CC::EQ : AArch64CC::NE; } else changeVectorFCMPPredToAArch64CC(Pred, CC, CC2, Invert); @@ -1051,12 +1025,12 @@ void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI, const bool NoNans = ST.getTargetLowering()->getTargetMachine().Options.NoNaNsFPMath; - auto Cmp = getVectorFCMP(CC, LHS, RHS, IsZero, NoNans, MRI); + auto Cmp = getVectorFCMP(CC, LHS, RHS, NoNans, MRI); Register CmpRes; if (CC2 == AArch64CC::AL) CmpRes = Cmp(MIB); else { - auto Cmp2 = getVectorFCMP(CC2, LHS, RHS, IsZero, NoNans, MRI); + auto Cmp2 = getVectorFCMP(CC2, LHS, RHS, NoNans, MRI); auto Cmp2Dst = Cmp2(MIB); auto Cmp1Dst = Cmp(MIB); CmpRes = MIB.buildOr(DstTy, Cmp1Dst, Cmp2Dst).getReg(0); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/lower-neon-vector-fcmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/lower-neon-vector-fcmp.mir index 1f5fb892df582..591dafc1ad098 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/lower-neon-vector-fcmp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/lower-neon-vector-fcmp.mir @@ -37,8 +37,10 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMEQZ:%[0-9]+]]:_(<2 x s64>) = G_FCMEQZ %lhs - ; CHECK-NEXT: $q0 = COPY [[FCMEQZ]](<2 x s64>) + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMEQ:%[0-9]+]]:_(<2 x s64>) = G_FCMEQ %lhs, %zero_vec(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[FCMEQ]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 %zero:_(s64) = G_CONSTANT i64 0 @@ -82,8 +84,10 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMGTZ:%[0-9]+]]:_(<2 x s64>) = G_FCMGTZ %lhs - ; CHECK-NEXT: $q0 = COPY [[FCMGTZ]](<2 x s64>) + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGT:%[0-9]+]]:_(<2 x s64>) = G_FCMGT %lhs, %zero_vec(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[FCMGT]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 %zero:_(s64) = G_CONSTANT i64 0 @@ -129,8 +133,10 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMGEZ:%[0-9]+]]:_(<2 x s64>) = G_FCMGEZ %lhs - ; CHECK-NEXT: 
$q0 = COPY [[FCMGEZ]](<2 x s64>) + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGE:%[0-9]+]]:_(<2 x s64>) = G_FCMGE %lhs, %zero_vec(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[FCMGE]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 %zero:_(s64) = G_CONSTANT i64 0 @@ -174,8 +180,10 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMLTZ:%[0-9]+]]:_(<2 x s64>) = G_FCMLTZ %lhs - ; CHECK-NEXT: $q0 = COPY [[FCMLTZ]](<2 x s64>) + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGT:%[0-9]+]]:_(<2 x s64>) = G_FCMGT %zero_vec, %lhs(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[FCMGT]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 %zero:_(s64) = G_CONSTANT i64 0 @@ -218,8 +226,10 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMLEZ:%[0-9]+]]:_(<2 x s64>) = G_FCMLEZ %lhs - ; CHECK-NEXT: $q0 = COPY [[FCMLEZ]](<2 x s64>) + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGE:%[0-9]+]]:_(<2 x s64>) = G_FCMGE %zero_vec, %lhs(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[FCMGE]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 %zero:_(s64) = G_CONSTANT i64 0 @@ -270,9 +280,11 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMGTZ:%[0-9]+]]:_(<2 x s64>) = G_FCMGTZ %lhs - ; CHECK-NEXT: [[FCMLTZ:%[0-9]+]]:_(<2 x s64>) = G_FCMLTZ %lhs - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s64>) = G_OR [[FCMLTZ]], [[FCMGTZ]] + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGT:%[0-9]+]]:_(<2 x s64>) = G_FCMGT %lhs, %zero_vec(<2 x s64>) + ; CHECK-NEXT: [[FCMGT1:%[0-9]+]]:_(<2 x s64>) = G_FCMGT %zero_vec, %lhs(<2 x s64>) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s64>) = G_OR [[FCMGT1]], [[FCMGT]] ; CHECK-NEXT: $q0 = COPY [[OR]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 @@ -434,10 +446,12 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMGEZ:%[0-9]+]]:_(<2 x s64>) = G_FCMGEZ %lhs + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGE:%[0-9]+]]:_(<2 x s64>) = G_FCMGE %lhs, %zero_vec(<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMGEZ]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMGE]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[XOR]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 @@ -490,10 +504,12 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMGTZ:%[0-9]+]]:_(<2 x s64>) = G_FCMGTZ %lhs + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGT:%[0-9]+]]:_(<2 x 
s64>) = G_FCMGT %lhs, %zero_vec(<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMGTZ]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMGT]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[XOR]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 @@ -546,10 +562,12 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMLEZ:%[0-9]+]]:_(<2 x s64>) = G_FCMLEZ %lhs + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGE:%[0-9]+]]:_(<2 x s64>) = G_FCMGE %zero_vec, %lhs(<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMLEZ]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMGE]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[XOR]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 @@ -602,10 +620,12 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMLTZ:%[0-9]+]]:_(<2 x s64>) = G_FCMLTZ %lhs + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMGT:%[0-9]+]]:_(<2 x s64>) = G_FCMGT %zero_vec, %lhs(<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMLTZ]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMGT]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[XOR]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 @@ -658,10 +678,12 @@ body: | ; CHECK: liveins: $q0, $q1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0 - ; CHECK-NEXT: [[FCMEQZ:%[0-9]+]]:_(<2 x s64>) = G_FCMEQZ %lhs + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: [[FCMEQ:%[0-9]+]]:_(<2 x s64>) = G_FCMEQ %lhs, %zero_vec(<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMEQZ]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<2 x s64>) = G_XOR [[FCMEQ]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[XOR]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lhs:_(<2 x s64>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-neon-vector-fcmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-neon-vector-fcmp.mir index 0b0c3ed763abc..daf84b5cf07e9 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-neon-vector-fcmp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-neon-vector-fcmp.mir @@ -77,7 +77,7 @@ body: | %lhs:fpr(<2 x s64>) = COPY $q0 %zero:gpr(s64) = G_CONSTANT i64 0 %zero_vec:fpr(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) - %fcmp:fpr(<2 x s64>) = G_FCMEQZ %lhs + %fcmp:fpr(<2 x s64>) = G_FCMEQ %lhs, %zero_vec(<2 x s64>) $q0 = COPY %fcmp(<2 x s64>) RET_ReallyLR 
implicit $q0 @@ -97,7 +97,7 @@ body: | %lhs:fpr(<2 x s64>) = COPY $q0 %zero:gpr(s64) = G_CONSTANT i64 0 %zero_vec:fpr(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) - %fcmp:fpr(<2 x s64>) = G_FCMGEZ %lhs + %fcmp:fpr(<2 x s64>) = G_FCMGE %lhs, %zero_vec(<2 x s64>) $q0 = COPY %fcmp(<2 x s64>) RET_ReallyLR implicit $q0 @@ -117,7 +117,7 @@ body: | %lhs:fpr(<2 x s64>) = COPY $q0 %zero:gpr(s64) = G_CONSTANT i64 0 %zero_vec:fpr(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) - %fcmp:fpr(<2 x s64>) = G_FCMGTZ %lhs + %fcmp:fpr(<2 x s64>) = G_FCMGT %lhs, %zero_vec(<2 x s64>) $q0 = COPY %fcmp(<2 x s64>) RET_ReallyLR implicit $q0 @@ -137,7 +137,7 @@ body: | %lhs:fpr(<2 x s64>) = COPY $q0 %zero:gpr(s64) = G_CONSTANT i64 0 %zero_vec:fpr(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) - %fcmp:fpr(<2 x s64>) = G_FCMLEZ %lhs + %fcmp:fpr(<2 x s64>) = G_FCMGE %zero_vec(<2 x s64>), %lhs $q0 = COPY %fcmp(<2 x s64>) RET_ReallyLR implicit $q0 @@ -157,6 +157,6 @@ body: | %lhs:fpr(<2 x s64>) = COPY $q0 %zero:gpr(s64) = G_CONSTANT i64 0 %zero_vec:fpr(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) - %fcmp:fpr(<2 x s64>) = G_FCMLTZ %lhs + %fcmp:fpr(<2 x s64>) = G_FCMGT %zero_vec(<2 x s64>), %lhs $q0 = COPY %fcmp(<2 x s64>) RET_ReallyLR implicit $q0 diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll index 9955b253f563e..b24e54a68fb42 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zip.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll @@ -405,8 +405,7 @@ define <16 x i8> @combine_v8i16_8firstundef(<8 x i8> %0, <8 x i8> %1) { define <4 x float> @shuffle_zip1(<4 x float> %arg) { ; CHECK-LABEL: shuffle_zip1: ; CHECK: // %bb.0: // %bb -; CHECK-NEXT: movi.2d v1, #0000000000000000 -; CHECK-NEXT: fcmgt.4s v0, v0, v1 +; CHECK-NEXT: fcmgt.4s v0, v0, #0.0 ; CHECK-NEXT: uzp1.8h v1, v0, v0 ; CHECK-NEXT: xtn.4h v0, v0 ; CHECK-NEXT: xtn.4h v1, v1 diff --git a/llvm/test/CodeGen/AArch64/select_cc.ll b/llvm/test/CodeGen/AArch64/select_cc.ll index 6feaabe85d9ba..66dbd4ed5a4dc 100644 --- a/llvm/test/CodeGen/AArch64/select_cc.ll +++ b/llvm/test/CodeGen/AArch64/select_cc.ll @@ -85,9 +85,8 @@ entry: define <2 x double> @select_olt_load_cmp(<2 x double> %a, ptr %src) { ; CHECK-SD-LABEL: select_olt_load_cmp: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: movi d1, #0000000000000000 -; CHECK-SD-NEXT: ldr d2, [x0] -; CHECK-SD-NEXT: fcmgt v1.2s, v2.2s, v1.2s +; CHECK-SD-NEXT: ldr d1, [x0] +; CHECK-SD-NEXT: fcmgt v1.2s, v1.2s, #0.0 ; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: ret From 37e8c6c6ee7c809e45d0e5b61c601a0bb91ca1c4 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 23 Apr 2025 13:32:59 +0300 Subject: [PATCH 043/245] [BOLT] Do not return Def-ed registers from MCPlusBuilder::getUsedRegs (#129890) Update the implementation of `MCPlusBuilder::getUsedRegs` to match its description in the header file, add unit tests. 
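For readers skimming the patch, the contract being restored is: getUsedRegs reports only the registers an instruction reads (explicit use operands plus implicit uses), never the ones it defines. The standalone sketch below is an illustration only, not BOLT code; the ToyInst type and the register numbering are invented for the example, and it simply mirrors the MC convention that an instruction's leading operands are its defs (MCInstrDesc::getNumDefs()).

    #include <cassert>
    #include <vector>

    // Toy stand-in for an MC instruction: the first NumDefs operands are
    // definitions, the remaining operands are uses.
    struct ToyInst {
      unsigned NumDefs;
      std::vector<int> RegOperands;
    };

    // Report only the used (read) registers, skipping the leading defs --
    // the behaviour this patch gives MCPlusBuilder::getUsedRegs.
    std::vector<int> getUsedRegs(const ToyInst &I) {
      return {I.RegOperands.begin() + I.NumDefs, I.RegOperands.end()};
    }

    int main() {
      // Shaped like "adds x0, x5, #42": one def (x0 -> 0), one use (x5 -> 5).
      ToyInst Adds{/*NumDefs=*/1, {0, 5}};
      std::vector<int> Used = getUsedRegs(Adds);
      assert(Used.size() == 1 && Used[0] == 5); // the def x0 is not reported
      return 0;
    }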
---
 bolt/lib/Core/MCPlusBuilder.cpp       |   6 +-
 bolt/unittests/Core/MCPlusBuilder.cpp | 140 ++++++++++++++++++++++----
 2 files changed, 125 insertions(+), 21 deletions(-)

diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index a3be147a09066..7752079b61538 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -442,10 +442,10 @@ void MCPlusBuilder::getUsedRegs(const MCInst &Inst, BitVector &Regs) const {
   for (MCPhysReg ImplicitUse : InstInfo.implicit_uses())
     Regs |= getAliases(ImplicitUse, /*OnlySmaller=*/true);
 
-  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
-    if (!Inst.getOperand(I).isReg())
+  for (const MCOperand &Operand : useOperands(Inst)) {
+    if (!Operand.isReg())
       continue;
-    Regs |= getAliases(Inst.getOperand(I).getReg(), /*OnlySmaller=*/true);
+    Regs |= getAliases(Operand.getReg(), /*OnlySmaller=*/true);
   }
 }
 
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index a3113cab3d334..7016dec0e3574 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -8,6 +8,7 @@
 
 #ifdef AARCH64_AVAILABLE
 #include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #endif // AARCH64_AVAILABLE
 
 #ifdef X86_AVAILABLE
@@ -19,6 +20,7 @@
 #include "bolt/Rewrite/RewriteInstance.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/TargetSelect.h"
 #include "gtest/gtest.h"
 
@@ -70,16 +72,28 @@ struct MCPlusBuilderTester : public testing::TestWithParam<Triple::ArchType> {
                                    BC->MRI.get(), BC->STI.get())));
   }
 
+  void assertRegMask(const BitVector &RegMask,
+                     std::initializer_list<MCPhysReg> ExpectedRegs) {
+    ASSERT_EQ(RegMask.count(), ExpectedRegs.size());
+    for (MCPhysReg Reg : ExpectedRegs)
+      ASSERT_TRUE(RegMask[Reg]) << "Expected " << BC->MRI->getName(Reg) << ".";
+  }
+
+  void assertRegMask(std::function<void(BitVector &)> FillRegMask,
+                     std::initializer_list<MCPhysReg> ExpectedRegs) {
+    BitVector RegMask(BC->MRI->getNumRegs());
+    FillRegMask(RegMask);
+    assertRegMask(RegMask, ExpectedRegs);
+  }
+
   void testRegAliases(Triple::ArchType Arch, uint64_t Register,
-                      uint64_t *Aliases, size_t Count,
+                      std::initializer_list<MCPhysReg> ExpectedAliases,
                       bool OnlySmaller = false) {
     if (GetParam() != Arch)
       GTEST_SKIP();
 
     const BitVector &BV = BC->MIB->getAliases(Register, OnlySmaller);
-    ASSERT_EQ(BV.count(), Count);
-    for (size_t I = 0; I < Count; ++I)
-      ASSERT_TRUE(BV[Aliases[I]]);
+    assertRegMask(BV, ExpectedAliases);
   }
 
   char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
@@ -94,17 +108,15 @@ INSTANTIATE_TEST_SUITE_P(AArch64, MCPlusBuilderTester,
                          ::testing::Values(Triple::aarch64));
 
 TEST_P(MCPlusBuilderTester, AliasX0) {
-  uint64_t AliasesX0[] = {AArch64::W0, AArch64::W0_HI,
-                          AArch64::X0, AArch64::W0_W1,
-                          AArch64::X0_X1, AArch64::X0_X1_X2_X3_X4_X5_X6_X7};
-  size_t AliasesX0Count = sizeof(AliasesX0) / sizeof(*AliasesX0);
-  testRegAliases(Triple::aarch64, AArch64::X0, AliasesX0, AliasesX0Count);
+  testRegAliases(Triple::aarch64, AArch64::X0,
+                 {AArch64::W0, AArch64::W0_HI, AArch64::X0, AArch64::W0_W1,
+                  AArch64::X0_X1, AArch64::X0_X1_X2_X3_X4_X5_X6_X7});
 }
 
 TEST_P(MCPlusBuilderTester, AliasSmallerX0) {
-  uint64_t AliasesX0[] = {AArch64::W0, AArch64::W0_HI, AArch64::X0};
-  size_t AliasesX0Count = sizeof(AliasesX0) / sizeof(*AliasesX0);
-  testRegAliases(Triple::aarch64, AArch64::X0, AliasesX0, AliasesX0Count, true);
+  testRegAliases(Triple::aarch64, AArch64::X0,
+                 {AArch64::W0, AArch64::W0_HI, AArch64::X0},
+                 /*OnlySmaller=*/true);
 }
 
 TEST_P(MCPlusBuilderTester, AArch64_CmpJE) {
@@ -155,6 +167,100 @@ TEST_P(MCPlusBuilderTester, AArch64_CmpJNE) {
   ASSERT_EQ(Label, BB->getLabel());
 }
 
+TEST_P(MCPlusBuilderTester, testAccessedRegsImplicitDef) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  // adds x0, x5, #42
+  MCInst Inst = MCInstBuilder(AArch64::ADDSXri)
+                    .addReg(AArch64::X0)
+                    .addReg(AArch64::X5)
+                    .addImm(42)
+                    .addImm(0);
+
+  assertRegMask([&](BitVector &BV) { BC->MIB->getClobberedRegs(Inst, BV); },
+                {AArch64::NZCV, AArch64::W0, AArch64::X0, AArch64::W0_HI,
+                 AArch64::X0_X1_X2_X3_X4_X5_X6_X7, AArch64::W0_W1,
+                 AArch64::X0_X1});
+
+  assertRegMask(
+      [&](BitVector &BV) { BC->MIB->getTouchedRegs(Inst, BV); },
+      {AArch64::NZCV, AArch64::W0, AArch64::W5, AArch64::X0, AArch64::X5,
+       AArch64::W0_HI, AArch64::W5_HI, AArch64::X0_X1_X2_X3_X4_X5_X6_X7,
+       AArch64::X2_X3_X4_X5_X6_X7_X8_X9, AArch64::X4_X5_X6_X7_X8_X9_X10_X11,
+       AArch64::W0_W1, AArch64::W4_W5, AArch64::X0_X1, AArch64::X4_X5});
+
+  assertRegMask([&](BitVector &BV) { BC->MIB->getWrittenRegs(Inst, BV); },
+                {AArch64::NZCV, AArch64::W0, AArch64::X0, AArch64::W0_HI});
+
+  assertRegMask([&](BitVector &BV) { BC->MIB->getUsedRegs(Inst, BV); },
+                {AArch64::W5, AArch64::X5, AArch64::W5_HI});
+
+  assertRegMask([&](BitVector &BV) { BC->MIB->getSrcRegs(Inst, BV); },
+                {AArch64::W5, AArch64::X5, AArch64::W5_HI});
+}
+
+TEST_P(MCPlusBuilderTester, testAccessedRegsImplicitUse) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  // b.eq