Merge master

scientificware · Oct 13, 2023 · 9898fed · 9898fed
2 parents 76bc84d + 1082c0e
commit 9898fed
Show file tree

Hide file tree

Showing 391 changed files with 5,673 additions and 3,926 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -146,8 +146,8 @@ jobs:
       apt-architecture: 'i386'
       # Some multilib libraries do not have proper inter-dependencies, so we have to
       # install their dependencies manually.
-      apt-extra-packages: 'libfreetype-dev:i386 libtiff-dev:i386 libcupsimage2-dev:i386 libc6-i386 libgcc-s1:i386 libstdc++6:i386'
-      extra-conf-options: '--with-target-bits=32'
+      apt-extra-packages: 'libfreetype-dev:i386 libtiff-dev:i386 libcupsimage2-dev:i386 libc6-i386 libgcc-s1:i386 libstdc++6:i386 libffi-dev:i386'
+      extra-conf-options: '--with-target-bits=32 --enable-fallback-linker --enable-libffi-bundling'
       configure-arguments: ${{ github.event.inputs.configure-arguments }}
       make-arguments: ${{ github.event.inputs.make-arguments }}
     if: needs.select.outputs.linux-x86 == 'true'

diff --git a/make/conf/jib-profiles.js b/make/conf/jib-profiles.js
@@ -426,9 +426,14 @@ var getJibProfilesProfiles = function (input, common, data) {
             target_os: "linux",
             target_cpu: "x86",
             build_cpu: "x64",
-            dependencies: ["devkit", "gtest"],
-            configure_args: concat(common.configure_args_32bit,
-                "--with-jvm-variants=minimal,server", "--with-zlib=system"),
+            dependencies: ["devkit", "gtest", "libffi"],
+            configure_args: concat(common.configure_args_32bit, [
+                "--with-jvm-variants=minimal,server",
+                "--with-zlib=system",
+                "--with-libffi=" + input.get("libffi", "home_path"),
+                "--enable-libffi-bundling",
+                "--enable-fallback-linker"
+            ])
         },
 
         "macosx-x64": {

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
@@ -227,7 +227,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true)
       NAME := fallbackLinker, \
       CFLAGS := $(CFLAGS_JDKLIB) $(LIBFFI_CFLAGS), \
       LDFLAGS := $(LDFLAGS_JDKLIB) \
-                  $(call SET_SHARED_LIBRARY_ORIGIN), \
+                 $(call SET_SHARED_LIBRARY_ORIGIN), \
       LIBS := $(LIBFFI_LIBS), \
       LIBS_windows := $(LIBFFI_LIBS) ws2_32.lib, \
   ))

diff --git a/make/test/BuildMicrobenchmark.gmk b/make/test/BuildMicrobenchmark.gmk
@@ -84,7 +84,9 @@ $(eval $(call SetupJavaCompilation, BUILD_INDIFY, \
 #### Compile Targets
 
 # Building microbenchmark requires the jdk.unsupported and java.management modules.
-# sun.security.util is required to compile Cache benchmark
+# sun.security.util is required to compile Cache benchmark.
+# jmh uses annotation processors to generate the benchmark jar and thus
+# requires the use of -processor option during benchmark compilation.
 
 # Build microbenchmark suite for the current JDK
 $(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \
@@ -106,7 +108,8 @@ $(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \
         --add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \
         --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
         --add-exports java.base/jdk.internal.event=ALL-UNNAMED \
-        --enable-preview, \
+        --enable-preview \
+        -processor org.openjdk.jmh.generators.BenchmarkProcessor, \
     JAVA_FLAGS := --add-modules jdk.unsupported --limit-modules java.management \
         --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
         --enable-preview, \

diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp
@@ -1790,6 +1790,11 @@ enum Nf {
   INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
   INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);
 
+  INSN(vsse8_v,  0b0100111, 0b000, 0b10, 0b0);
+  INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
+  INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
+  INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);
+
 #undef INSN
 #undef patch_VLdSt
 

diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
@@ -1289,6 +1289,13 @@ class MacroAssembler: public Assembler {
   }
 
   // vector pseudo instructions
+  // rotate vector register left with shift bits, 32-bit version
+  inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {
+    vsrl_vi(tmp_vr, vd, 32 - shift);
+    vsll_vi(vd, vd, shift);
+    vor_vv(vd, vd, tmp_vr);
+  }
+
   inline void vl1r_v(VectorRegister vd, Register rs) {
     vl1re8_v(vd, rs);
   }

diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
@@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
     return (address) start;
   }
 
+  /**
+   * Perform the quarter round calculations on values contained within four vector registers.
+   *
+   * @param aVec the SIMD register containing only the "a" values
+   * @param bVec the SIMD register containing only the "b" values
+   * @param cVec the SIMD register containing only the "c" values
+   * @param dVec the SIMD register containing only the "d" values
+   * @param tmp_vr temporary vector register holds intermedia values.
+   */
+  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
+                          VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
+    // a += b, d ^= a, d <<<= 16
+    __ vadd_vv(aVec, aVec, bVec);
+    __ vxor_vv(dVec, dVec, aVec);
+    __ vrole32_vi(dVec, 16, tmp_vr);
+
+    // c += d, b ^= c, b <<<= 12
+    __ vadd_vv(cVec, cVec, dVec);
+    __ vxor_vv(bVec, bVec, cVec);
+    __ vrole32_vi(bVec, 12, tmp_vr);
+
+    // a += b, d ^= a, d <<<= 8
+    __ vadd_vv(aVec, aVec, bVec);
+    __ vxor_vv(dVec, dVec, aVec);
+    __ vrole32_vi(dVec, 8, tmp_vr);
+
+    // c += d, b ^= c, b <<<= 7
+    __ vadd_vv(cVec, cVec, dVec);
+    __ vxor_vv(bVec, bVec, cVec);
+    __ vrole32_vi(bVec, 7, tmp_vr);
+  }
+
+  /**
+   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
+   *
+   *  Input arguments:
+   *  c_rarg0   - state, the starting state
+   *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
+   *
+   *  Implementation Note:
+   *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
+   *   N depends on single vector register length.
+   */
+  address generate_chacha20Block() {
+    Label L_Rounds;
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
+    address start = __ pc();
+    __ enter();
+
+    const int states_len = 16;
+    const int step = 4;
+    const Register state = c_rarg0;
+    const Register key_stream = c_rarg1;
+    const Register tmp_addr = t0;
+    const Register length = t1;
+
+    // Organize vector registers in an array that facilitates
+    // putting repetitive opcodes into loop structures below.
+    const VectorRegister work_vrs[16] = {
+      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
+      v8, v9, v10, v11, v12, v13, v14, v15
+    };
+    const VectorRegister tmp_vr = v16;
+    const VectorRegister counter_vr = v17;
+
+    {
+      // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
+      // in java level.
+      __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
+    }
+
+    // Load from source state.
+    // Every element in source state is duplicated to all elements in the corresponding vector.
+    __ mv(tmp_addr, state);
+    for (int i = 0; i < states_len; i += 1) {
+      __ vlse32_v(work_vrs[i], tmp_addr, zr);
+      __ addi(tmp_addr, tmp_addr, step);
+    }
+    // Adjust counter for every individual block.
+    __ vid_v(counter_vr);
+    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
+
+    // Perform 10 iterations of the 8 quarter round set
+    {
+      const Register loop = t2; // share t2 with other non-overlapping usages.
+      __ mv(loop, 10);
+      __ BIND(L_Rounds);
+
+      chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
+      chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
+      chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
+      chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
+
+      chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
+      chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
+      chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
+      chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
+
+      __ sub(loop, loop, 1);
+      __ bnez(loop, L_Rounds);
+    }
+
+    // Add the original state into the end working state.
+    // We do this by first duplicating every element in source state array to the corresponding
+    // vector, then adding it to the post-loop working state.
+    __ mv(tmp_addr, state);
+    for (int i = 0; i < states_len; i += 1) {
+      __ vlse32_v(tmp_vr, tmp_addr, zr);
+      __ addi(tmp_addr, tmp_addr, step);
+      __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
+    }
+    // Add the counter overlay onto work_vrs[12] at the end.
+    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
+
+    // Store result to key stream.
+    {
+      const Register stride = t2; // share t2 with other non-overlapping usages.
+      // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
+      __ mv(stride, 64);
+      for (int i = 0; i < states_len; i += 1) {
+        __ vsse32_v(work_vrs[i], key_stream, stride);
+        __ addi(key_stream, key_stream, step);
+      }
+    }
+
+    // Return length of output key_stream
+    __ slli(c_rarg0, length, 6);
+
+    __ leave();
+    __ ret();
+
+    return (address) start;
+  }
+
 #if INCLUDE_JFR
 
   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
     }
+
+    if (UseChaCha20Intrinsics) {
+      StubRoutines::_chacha20Block = generate_chacha20Block();
+    }
+
 #endif // COMPILER2_OR_JVMCI
   }
 

diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp
@@ -253,6 +253,16 @@ void VM_Version::initialize() {
     warning("Block zeroing is not available");
     FLAG_SET_DEFAULT(UseBlockZeroing, false);
   }
+  if (UseRVV) {
+    if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+      FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
+    }
+  } else if (UseChaCha20Intrinsics) {
+    if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+      warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)");
+    }
+    FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
+  }
 
 #ifdef COMPILER2
   c2_initialize();

diff --git a/src/hotspot/cpu/s390/assembler_s390.hpp b/src/hotspot/cpu/s390/assembler_s390.hpp
@@ -140,7 +140,8 @@ class RelAddr {
     if ((target == nullptr) || (target == pc)) {
       return 0;  // Yet unknown branch destination.
     } else {
-      guarantee(is_in_range_of_RelAddr(target, pc, shortForm), "target not within reach");
+      guarantee(is_in_range_of_RelAddr(target, pc, shortForm),
+                "target not within reach at " INTPTR_FORMAT ", distance = " INTX_FORMAT, p2i(pc), (target - pc) );
       return (int)((target - pc)>>1);
     }
   }

diff --git a/src/hotspot/cpu/s390/c1_MacroAssembler_s390.cpp b/src/hotspot/cpu/s390/c1_MacroAssembler_s390.cpp
@@ -101,7 +101,7 @@ void C1_MacroAssembler::lock_object(Register Rmark, Register Roop, Register Rbox
   if (DiagnoseSyncOnValueBasedClasses != 0) {
     load_klass(tmp, Roop);
     testbit(Address(tmp, Klass::access_flags_offset()), exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
-    z_btrue(slow_case);
+    branch_optimized(Assembler::bcondAllOne, slow_case);
   }
 
   assert(LockingMode != LM_MONITOR, "LM_MONITOR is already handled, by emit_lock()");
@@ -170,7 +170,7 @@ void C1_MacroAssembler::unlock_object(Register Rmark, Register Roop, Register Rb
     z_lg(Rmark, Address(Roop, hdr_offset));
     z_lgr(tmp, Rmark);
     z_nill(tmp, markWord::monitor_value);
-    z_brnz(slow_case);
+    branch_optimized(Assembler::bcondNotZero, slow_case);
     lightweight_unlock(Roop, Rmark, tmp, slow_case);
   } else if (LockingMode == LM_LEGACY) {
     // Test if object header is pointing to the displaced header, and if so, restore

diff --git a/src/hotspot/cpu/s390/interp_masm_s390.cpp b/src/hotspot/cpu/s390/interp_masm_s390.cpp
@@ -36,6 +36,7 @@
 #include "oops/markWord.hpp"
 #include "oops/methodCounters.hpp"
 #include "oops/methodData.hpp"
+#include "oops/resolvedFieldEntry.hpp"
 #include "oops/resolvedIndyEntry.hpp"
 #include "prims/jvmtiExport.hpp"
 #include "prims/jvmtiThreadState.hpp"
@@ -349,16 +350,45 @@ void InterpreterMacroAssembler::get_cache_and_index_at_bcp(Register cache, Regis
 }
 
 void InterpreterMacroAssembler::load_resolved_indy_entry(Register cache, Register index) {
-  // Get index out of bytecode pointer, get_cache_entry_pointer_at_bcp
+  // Get index out of bytecode pointer.
   get_cache_index_at_bcp(index, 1, sizeof(u4));
-  // Get address of invokedynamic array
+
+  // Get the address of the ResolvedIndyEntry array
   get_constant_pool_cache(cache);
   z_lg(cache, Address(cache, in_bytes(ConstantPoolCache::invokedynamic_entries_offset())));
-  // Scale the index to be the entry index * sizeof(ResolvedInvokeDynamicInfo)
-  z_sllg(index, index, exact_log2(sizeof(ResolvedIndyEntry)));
+
+  // Scale the index to form a byte offset into the ResolvedIndyEntry array
+  size_t entry_size = sizeof(ResolvedIndyEntry);
+  if (is_power_of_2(entry_size)) {
+    z_sllg(index, index, exact_log2(entry_size));
+  } else {
+    z_mghi(index, entry_size);
+  }
+
+  // Calculate the final field address.
   z_la(cache, Array<ResolvedIndyEntry>::base_offset_in_bytes(), index, cache);
 }
 
+void InterpreterMacroAssembler::load_field_entry(Register cache, Register index, int bcp_offset) {
+  // Get field index out of bytecode pointer.
+  get_cache_index_at_bcp(index, bcp_offset, sizeof(u2));
+
+  // Get the address of the ResolvedFieldEntry array.
+  get_constant_pool_cache(cache);
+  z_lg(cache, Address(cache, in_bytes(ConstantPoolCache::field_entries_offset())));
+
+  // Scale the index to form a byte offset into the ResolvedFieldEntry array
+  size_t entry_size = sizeof(ResolvedFieldEntry);
+  if (is_power_of_2(entry_size)) {
+    z_sllg(index, index, exact_log2(entry_size));
+  } else {
+    z_mghi(index, entry_size);
+  }
+
+  // Calculate the final field address.
+  z_la(cache, Array<ResolvedFieldEntry>::base_offset_in_bytes(), index, cache);
+}
+
 // Kills Z_R0_scratch.
 void InterpreterMacroAssembler::get_cache_and_index_and_bytecode_at_bcp(Register cache,
                                                                         Register cpe_offset,

diff --git a/src/hotspot/cpu/s390/interp_masm_s390.hpp b/src/hotspot/cpu/s390/interp_masm_s390.hpp
@@ -113,6 +113,7 @@ class InterpreterMacroAssembler: public MacroAssembler {
 
   void get_cache_and_index_at_bcp(Register cache, Register cpe_offset, int bcp_offset, size_t index_size = sizeof(u2));
   void load_resolved_indy_entry(Register cache, Register index);
+  void load_field_entry(Register cache, Register index, int bcp_offset = 1);
   void get_cache_and_index_and_bytecode_at_bcp(Register cache, Register cpe_offset, Register bytecode,
                                                int byte_no, int bcp_offset, size_t index_size = sizeof(u2));
   void get_cache_entry_pointer_at_bcp(Register cache, Register tmp, int bcp_offset, size_t index_size = sizeof(u2));