Skip to content

Commit

Permalink
Merge master
Browse files Browse the repository at this point in the history
  • Loading branch information
scientificware committed Oct 13, 2023
2 parents 76bc84d + 1082c0e commit 9898fed
Show file tree
Hide file tree
Showing 391 changed files with 5,673 additions and 3,926 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -146,8 +146,8 @@ jobs:
apt-architecture: 'i386'
# Some multilib libraries do not have proper inter-dependencies, so we have to
# install their dependencies manually.
apt-extra-packages: 'libfreetype-dev:i386 libtiff-dev:i386 libcupsimage2-dev:i386 libc6-i386 libgcc-s1:i386 libstdc++6:i386'
extra-conf-options: '--with-target-bits=32'
apt-extra-packages: 'libfreetype-dev:i386 libtiff-dev:i386 libcupsimage2-dev:i386 libc6-i386 libgcc-s1:i386 libstdc++6:i386 libffi-dev:i386'
extra-conf-options: '--with-target-bits=32 --enable-fallback-linker --enable-libffi-bundling'
configure-arguments: ${{ github.event.inputs.configure-arguments }}
make-arguments: ${{ github.event.inputs.make-arguments }}
if: needs.select.outputs.linux-x86 == 'true'
Expand Down
11 changes: 8 additions & 3 deletions make/conf/jib-profiles.js
Original file line number Diff line number Diff line change
Expand Up @@ -426,9 +426,14 @@ var getJibProfilesProfiles = function (input, common, data) {
target_os: "linux",
target_cpu: "x86",
build_cpu: "x64",
dependencies: ["devkit", "gtest"],
configure_args: concat(common.configure_args_32bit,
"--with-jvm-variants=minimal,server", "--with-zlib=system"),
dependencies: ["devkit", "gtest", "libffi"],
configure_args: concat(common.configure_args_32bit, [
"--with-jvm-variants=minimal,server",
"--with-zlib=system",
"--with-libffi=" + input.get("libffi", "home_path"),
"--enable-libffi-bundling",
"--enable-fallback-linker"
])
},

"macosx-x64": {
Expand Down
2 changes: 1 addition & 1 deletion make/modules/java.base/Lib.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true)
NAME := fallbackLinker, \
CFLAGS := $(CFLAGS_JDKLIB) $(LIBFFI_CFLAGS), \
LDFLAGS := $(LDFLAGS_JDKLIB) \
$(call SET_SHARED_LIBRARY_ORIGIN), \
$(call SET_SHARED_LIBRARY_ORIGIN), \
LIBS := $(LIBFFI_LIBS), \
LIBS_windows := $(LIBFFI_LIBS) ws2_32.lib, \
))
Expand Down
7 changes: 5 additions & 2 deletions make/test/BuildMicrobenchmark.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ $(eval $(call SetupJavaCompilation, BUILD_INDIFY, \
#### Compile Targets

# Building microbenchmark requires the jdk.unsupported and java.management modules.
# sun.security.util is required to compile Cache benchmark
# sun.security.util is required to compile Cache benchmark.
# jmh uses annotation processors to generate the benchmark jar and thus
# requires the use of -processor option during benchmark compilation.

# Build microbenchmark suite for the current JDK
$(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \
Expand All @@ -106,7 +108,8 @@ $(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \
--add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \
--add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
--add-exports java.base/jdk.internal.event=ALL-UNNAMED \
--enable-preview, \
--enable-preview \
-processor org.openjdk.jmh.generators.BenchmarkProcessor, \
JAVA_FLAGS := --add-modules jdk.unsupported --limit-modules java.management \
--add-exports java.base/jdk.internal.vm=ALL-UNNAMED \
--enable-preview, \
Expand Down
5 changes: 5 additions & 0 deletions src/hotspot/cpu/riscv/assembler_riscv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1790,6 +1790,11 @@ enum Nf {
INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);

INSN(vsse8_v, 0b0100111, 0b000, 0b10, 0b0);
INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);

#undef INSN
#undef patch_VLdSt

Expand Down
7 changes: 7 additions & 0 deletions src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,13 @@ class MacroAssembler: public Assembler {
}

// vector pseudo instructions
// rotate vector register left with shift bits, 32-bit version
inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {
vsrl_vi(tmp_vr, vd, 32 - shift);
vsll_vi(vd, vd, shift);
vor_vv(vd, vd, tmp_vr);
}

inline void vl1r_v(VectorRegister vd, Register rs) {
vl1re8_v(vd, rs);
}
Expand Down
141 changes: 141 additions & 0 deletions src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
return (address) start;
}

/**
* Perform the quarter round calculations on values contained within four vector registers.
*
* @param aVec the SIMD register containing only the "a" values
* @param bVec the SIMD register containing only the "b" values
* @param cVec the SIMD register containing only the "c" values
* @param dVec the SIMD register containing only the "d" values
* @param tmp_vr temporary vector register holds intermedia values.
*/
void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
// a += b, d ^= a, d <<<= 16
__ vadd_vv(aVec, aVec, bVec);
__ vxor_vv(dVec, dVec, aVec);
__ vrole32_vi(dVec, 16, tmp_vr);

// c += d, b ^= c, b <<<= 12
__ vadd_vv(cVec, cVec, dVec);
__ vxor_vv(bVec, bVec, cVec);
__ vrole32_vi(bVec, 12, tmp_vr);

// a += b, d ^= a, d <<<= 8
__ vadd_vv(aVec, aVec, bVec);
__ vxor_vv(dVec, dVec, aVec);
__ vrole32_vi(dVec, 8, tmp_vr);

// c += d, b ^= c, b <<<= 7
__ vadd_vv(cVec, cVec, dVec);
__ vxor_vv(bVec, bVec, cVec);
__ vrole32_vi(bVec, 7, tmp_vr);
}

/**
* int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
*
* Input arguments:
* c_rarg0 - state, the starting state
* c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
*
* Implementation Note:
* Parallelization is achieved by loading individual state elements into vectors for N blocks.
* N depends on single vector register length.
*/
address generate_chacha20Block() {
Label L_Rounds;

__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
address start = __ pc();
__ enter();

const int states_len = 16;
const int step = 4;
const Register state = c_rarg0;
const Register key_stream = c_rarg1;
const Register tmp_addr = t0;
const Register length = t1;

// Organize vector registers in an array that facilitates
// putting repetitive opcodes into loop structures below.
const VectorRegister work_vrs[16] = {
v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15
};
const VectorRegister tmp_vr = v16;
const VectorRegister counter_vr = v17;

{
// Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
// in java level.
__ vsetivli(length, 16, Assembler::e32, Assembler::m1);
}

// Load from source state.
// Every element in source state is duplicated to all elements in the corresponding vector.
__ mv(tmp_addr, state);
for (int i = 0; i < states_len; i += 1) {
__ vlse32_v(work_vrs[i], tmp_addr, zr);
__ addi(tmp_addr, tmp_addr, step);
}
// Adjust counter for every individual block.
__ vid_v(counter_vr);
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

// Perform 10 iterations of the 8 quarter round set
{
const Register loop = t2; // share t2 with other non-overlapping usages.
__ mv(loop, 10);
__ BIND(L_Rounds);

chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);

chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);

__ sub(loop, loop, 1);
__ bnez(loop, L_Rounds);
}

// Add the original state into the end working state.
// We do this by first duplicating every element in source state array to the corresponding
// vector, then adding it to the post-loop working state.
__ mv(tmp_addr, state);
for (int i = 0; i < states_len; i += 1) {
__ vlse32_v(tmp_vr, tmp_addr, zr);
__ addi(tmp_addr, tmp_addr, step);
__ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
}
// Add the counter overlay onto work_vrs[12] at the end.
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

// Store result to key stream.
{
const Register stride = t2; // share t2 with other non-overlapping usages.
// Every block occupies 64 bytes, so we use 64 as stride of the vector store.
__ mv(stride, 64);
for (int i = 0; i < states_len; i += 1) {
__ vsse32_v(work_vrs[i], key_stream, stride);
__ addi(key_stream, key_stream, step);
}
}

// Return length of output key_stream
__ slli(c_rarg0, length, 6);

__ leave();
__ ret();

return (address) start;
}

#if INCLUDE_JFR

static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
Expand Down Expand Up @@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
}

if (UseChaCha20Intrinsics) {
StubRoutines::_chacha20Block = generate_chacha20Block();
}

#endif // COMPILER2_OR_JVMCI
}

Expand Down
10 changes: 10 additions & 0 deletions src/hotspot/cpu/riscv/vm_version_riscv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,16 @@ void VM_Version::initialize() {
warning("Block zeroing is not available");
FLAG_SET_DEFAULT(UseBlockZeroing, false);
}
if (UseRVV) {
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
}
} else if (UseChaCha20Intrinsics) {
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)");
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}

#ifdef COMPILER2
c2_initialize();
Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/cpu/s390/assembler_s390.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ class RelAddr {
if ((target == nullptr) || (target == pc)) {
return 0; // Yet unknown branch destination.
} else {
guarantee(is_in_range_of_RelAddr(target, pc, shortForm), "target not within reach");
guarantee(is_in_range_of_RelAddr(target, pc, shortForm),
"target not within reach at " INTPTR_FORMAT ", distance = " INTX_FORMAT, p2i(pc), (target - pc) );
return (int)((target - pc)>>1);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/hotspot/cpu/s390/c1_MacroAssembler_s390.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void C1_MacroAssembler::lock_object(Register Rmark, Register Roop, Register Rbox
if (DiagnoseSyncOnValueBasedClasses != 0) {
load_klass(tmp, Roop);
testbit(Address(tmp, Klass::access_flags_offset()), exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
z_btrue(slow_case);
branch_optimized(Assembler::bcondAllOne, slow_case);
}

assert(LockingMode != LM_MONITOR, "LM_MONITOR is already handled, by emit_lock()");
Expand Down Expand Up @@ -170,7 +170,7 @@ void C1_MacroAssembler::unlock_object(Register Rmark, Register Roop, Register Rb
z_lg(Rmark, Address(Roop, hdr_offset));
z_lgr(tmp, Rmark);
z_nill(tmp, markWord::monitor_value);
z_brnz(slow_case);
branch_optimized(Assembler::bcondNotZero, slow_case);
lightweight_unlock(Roop, Rmark, tmp, slow_case);
} else if (LockingMode == LM_LEGACY) {
// Test if object header is pointing to the displaced header, and if so, restore
Expand Down
38 changes: 34 additions & 4 deletions src/hotspot/cpu/s390/interp_masm_s390.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "oops/markWord.hpp"
#include "oops/methodCounters.hpp"
#include "oops/methodData.hpp"
#include "oops/resolvedFieldEntry.hpp"
#include "oops/resolvedIndyEntry.hpp"
#include "prims/jvmtiExport.hpp"
#include "prims/jvmtiThreadState.hpp"
Expand Down Expand Up @@ -349,16 +350,45 @@ void InterpreterMacroAssembler::get_cache_and_index_at_bcp(Register cache, Regis
}

void InterpreterMacroAssembler::load_resolved_indy_entry(Register cache, Register index) {
// Get index out of bytecode pointer, get_cache_entry_pointer_at_bcp
// Get index out of bytecode pointer.
get_cache_index_at_bcp(index, 1, sizeof(u4));
// Get address of invokedynamic array

// Get the address of the ResolvedIndyEntry array
get_constant_pool_cache(cache);
z_lg(cache, Address(cache, in_bytes(ConstantPoolCache::invokedynamic_entries_offset())));
// Scale the index to be the entry index * sizeof(ResolvedInvokeDynamicInfo)
z_sllg(index, index, exact_log2(sizeof(ResolvedIndyEntry)));

// Scale the index to form a byte offset into the ResolvedIndyEntry array
size_t entry_size = sizeof(ResolvedIndyEntry);
if (is_power_of_2(entry_size)) {
z_sllg(index, index, exact_log2(entry_size));
} else {
z_mghi(index, entry_size);
}

// Calculate the final field address.
z_la(cache, Array<ResolvedIndyEntry>::base_offset_in_bytes(), index, cache);
}

void InterpreterMacroAssembler::load_field_entry(Register cache, Register index, int bcp_offset) {
// Get field index out of bytecode pointer.
get_cache_index_at_bcp(index, bcp_offset, sizeof(u2));

// Get the address of the ResolvedFieldEntry array.
get_constant_pool_cache(cache);
z_lg(cache, Address(cache, in_bytes(ConstantPoolCache::field_entries_offset())));

// Scale the index to form a byte offset into the ResolvedFieldEntry array
size_t entry_size = sizeof(ResolvedFieldEntry);
if (is_power_of_2(entry_size)) {
z_sllg(index, index, exact_log2(entry_size));
} else {
z_mghi(index, entry_size);
}

// Calculate the final field address.
z_la(cache, Array<ResolvedFieldEntry>::base_offset_in_bytes(), index, cache);
}

// Kills Z_R0_scratch.
void InterpreterMacroAssembler::get_cache_and_index_and_bytecode_at_bcp(Register cache,
Register cpe_offset,
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/s390/interp_masm_s390.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ class InterpreterMacroAssembler: public MacroAssembler {

void get_cache_and_index_at_bcp(Register cache, Register cpe_offset, int bcp_offset, size_t index_size = sizeof(u2));
void load_resolved_indy_entry(Register cache, Register index);
void load_field_entry(Register cache, Register index, int bcp_offset = 1);
void get_cache_and_index_and_bytecode_at_bcp(Register cache, Register cpe_offset, Register bytecode,
int byte_no, int bcp_offset, size_t index_size = sizeof(u2));
void get_cache_entry_pointer_at_bcp(Register cache, Register tmp, int bcp_offset, size_t index_size = sizeof(u2));
Expand Down

0 comments on commit 9898fed

Please sign in to comment.