# Captured from the GitHub Actions run page for:
#   [ktlo] Reduce CI test flakiness on Windows (#4653) #119
# The workflow definition follows.

# Workflow header: triggers, concurrency policy and shared environment.
name: win-package-test

on:
  # Every push to main runs the workflow.
  push:
    branches:
      - main
  # Pull requests run it only when one of these paths changes
  # (markdown files are excluded by the trailing negation).
  pull_request:
    paths:
      - '.github/workflows/scripts/*.ps1'
      - '.github/workflows/win-package-test.yml'
      - 'cmd/otelcol/**'
      - 'internal/buildscripts/packaging/choco/**'
      - 'internal/buildscripts/packaging/installer/install.ps1'
      - 'internal/buildscripts/packaging/jmx-metric-gatherer-release.txt'
      - 'internal/buildscripts/packaging/msi/**'
      - 'internal/signalfx-agent/bundle/**'
      - 'Makefile'
      - 'Makefile.Common'
      - 'tests/zeroconfig/windows/**'
      - '!**.md'

# One active run per pull request (or per ref when there is no PR);
# a newer run cancels the one still in progress.
concurrency:
  group: win-package-test-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  # Quoted so the version can never be re-typed by a YAML loader.
  GO_VERSION: '1.21.9'

jobs:
# ---- job: setup-environment (checkout, Go setup, `make install-tools`) ----
  setup-environment:
    # Use 20.04.5 until https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/16450 is resolved
    runs-on: ubuntu-20.04
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}
          cache-dependency-path: '**/go.sum'

      - name: Installing dependency
        run: make install-tools
# ---- job: cross-compile (builds the Windows binaries and uploads ./bin) ----
  cross-compile:
    # Use 20.04.5 until https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/16450 is resolved
    runs-on: ubuntu-20.04
    needs:
      - setup-environment
    strategy:
      matrix:
        # Each entry is both the make target and the artifact name.
        SYS_BINARIES:
          - "binaries-windows_amd64"
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}
          cache-dependency-path: '**/go.sum'

      - name: Build Collector
        run: make ${{ matrix.SYS_BINARIES }}

      - name: Uploading binaries
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.SYS_BINARIES }}
          path: ./bin/*
# ---- job: agent-bundle-windows (builds and uploads the agent bundle zip) ----
  agent-bundle-windows:
    runs-on: windows-2022
    env:
      PIP_CACHE_DIR: ${{ github.workspace }}/.cache/pip
    steps:
      - uses: actions/checkout@v4

      # The pip cache is keyed on the two files hashed below.
      - uses: actions/cache@v4
        with:
          path: ${{ env.PIP_CACHE_DIR }}
          key: agent-bundle-windows-pip-${{ hashFiles('internal/signalfx-agent/bundle/collectd-plugins.yaml', 'internal/signalfx-agent/bundle/scripts/requirements.txt') }}

      - run: ./internal/signalfx-agent/bundle/scripts/windows/make.ps1 bundle

      - uses: actions/upload-artifact@v4
        with:
          name: agent-bundle-windows
          path: ./dist/agent-bundle_windows_amd64.zip
# ---- job: msi-custom-actions (tests, publishes and packages the MSI custom actions) ----
  msi-custom-actions:
    runs-on: windows-2022
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      # Run the unit tests first, then publish the Release build into ./bin/Release.
      - name: Build Custom Actions
        working-directory: internal/buildscripts/packaging/msi/SplunkCustomActions
        run: |
          dotnet test ./test/SplunkCustomActionsTests.csproj -c Release
          dotnet publish ./src/SplunkCustomActions.csproj -c Release -o ./bin/Release

      # MakeSfxCA.exe is resolved through PATH, which is extended with the WiX SDK directory.
      - name: Package Custom Actions
        run: |
          $WixPath = "${Env:ProgramFiles(x86)}\WiX Toolset v3.14"
          $sfxcaDll = "${WixPath}\SDK\x64\sfxca.dll"
          $Env:PATH = "${WixPath}\SDK;" + $Env:PATH
          $customActionDir = "${PWD}\internal\buildscripts\packaging\msi\SplunkCustomActions"
          $customActionBinDir = "${customActionDir}\bin\Release"
          MakeSfxCA.exe "${PWD}\dist\SplunkCustomActions.CA.dll" `
            "${sfxcaDll}" `
            "${customActionBinDir}\SplunkCustomActions.dll" `
            "${customActionBinDir}\Microsoft.Deployment.WindowsInstaller.dll" `
            "${customActionDir}\src\CustomAction.config"

      - uses: actions/upload-artifact@v4
        with:
          name: msi-custom-actions
          path: ./dist/SplunkCustomActions.CA.dll
# ---- job: msi-build (assembles the MSI from the three upstream artifacts) ----
  msi-build:
    # Use 20.04.5 until https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/16450 is resolved
    runs-on: ubuntu-20.04
    needs:
      - cross-compile
      - agent-bundle-windows
      - msi-custom-actions
    steps:
      # Full history is fetched (fetch-depth: 0).
      - name: Check out the codebase.
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Downloading binaries-windows_amd64
        uses: actions/download-artifact@v4
        with:
          name: binaries-windows_amd64
          path: ./bin

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}
          cache-dependency-path: '**/go.sum'

      - name: Downloading agent-bundle-windows
        uses: actions/download-artifact@v4
        with:
          name: agent-bundle-windows
          path: ./dist

      - name: Downloading msi-custom-actions
        uses: actions/download-artifact@v4
        with:
          name: msi-custom-actions
          path: ./internal/buildscripts/packaging/msi/SplunkCustomActions/bin/Release

      # SKIP_COMPILE=true: the collector binaries come from the downloaded artifact.
      - name: Build MSI
        run: |
          mkdir -p dist
          make msi SKIP_COMPILE=true VERSION=""

      - name: Uploading msi build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: msi-build
          path: ./dist/*.msi
# ---- job: dotnet-zeroconfig-e2e-test (runs tests/zeroconfig/windows/run-tests.ps1) ----
  dotnet-zeroconfig-e2e-test:
    runs-on: windows-2022
    needs:
      - msi-build
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Setup NuGet
        uses: nuget/setup-nuget@v2.0.0

      - name: Add msbuild to PATH
        uses: microsoft/setup-msbuild@v2

      - uses: actions/setup-dotnet@v4.0.0
        with:
          dotnet-version: '6.0.407'

      # The MSI is placed next to the docker-setup test data.
      - name: Download Splunk OTel Collector msi
        uses: actions/download-artifact@v4
        with:
          name: msi-build
          path: ./tests/zeroconfig/windows/testdata/docker-setup/

      # Publishes the latest release name as the step output `release`.
      - name: Get splunk-otel-dotnet latest release name
        id: dotnet-instrumentation
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          $release = gh release view --repo signalfx/splunk-otel-dotnet --json name | ConvertFrom-Json | select name
          "release=$($release.name)" | Out-File -FilePath $env:GITHUB_OUTPUT -Append

      # Replaces the SPLUNK_OTEL_DOTNET_VERSION placeholder in the expected
      # trace files with the release name, minus a leading "v".
      - name: Set SPLUNK_OTEL_DOTNET_VERSION
        shell: bash
        run: |
          version="${{ steps.dotnet-instrumentation.outputs.release }}"
          sed -i "s|SPLUNK_OTEL_DOTNET_VERSION|${version#v}|" tests/zeroconfig/windows/testdata/resource_traces/aspnetcore.yaml
          sed -i "s|SPLUNK_OTEL_DOTNET_VERSION|${version#v}|" tests/zeroconfig/windows/testdata/resource_traces/aspnetfx.yaml

      - name: Run the test script
        working-directory: tests/zeroconfig/windows/
        run: .\run-tests.ps1
# ---- job: msi-without-wrappers-test (installs the MSI directly with msiexec) ----
  msi-without-wrappers-test:
    runs-on: ${{ matrix.OS }}
    needs: [msi-build]
    timeout-minutes: 15
    strategy:
      max-parallel: 1
      matrix:
        OS: ["windows-2022"]
        # Each entry pairs the msiexec properties used at install time with the
        # arguments handed to win-test-services.ps1 to verify the result.
        install-properties:
          - test: "default"
            install: "SPLUNK_ACCESS_TOKEN=fakeToken"
            expected: "-mode agent -access_token fakeToken -realm us0 -memory $null -with_fluentd false"
          - test: "gateway"
            install: "SPLUNK_SETUP_COLLECTOR_MODE=gateway SPLUNK_ACCESS_TOKEN=testing123"
            expected: "-mode gateway -access_token testing123 -realm us0 -memory $null -with_fluentd false"
          - test: "realm"
            install: "SPLUNK_REALM=myrealm SPLUNK_ACCESS_TOKEN=testing"
            expected: "-mode agent -access_token testing -realm myrealm -memory $null -with_fluentd false"
          - test: "ingest-url"
            install: "SPLUNK_INGEST_URL=https://fake.ingest.url SPLUNK_ACCESS_TOKEN=testing"
            expected: "-mode agent -realm us0 -access_token testing -ingest_url \"https://fake.ingest.url\" -memory $null -with_fluentd false"
          - test: "optional-params"
            install: "SPLUNK_ACCESS_TOKEN=fakeToken SPLUNK_MEMORY_TOTAL_MIB=256"
            expected: "-mode agent -access_token fakeToken -realm us0 -memory 256 -with_fluentd false"
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Downloading msi build
        uses: actions/download-artifact@v4
        with:
          name: msi-build
          path: ./dist

      # Runs with 'Continue' so non-terminating errors in the port script do not stop the step.
      - name: Ensure required ports in the dynamic range are available
        run: |
          $ErrorActionPreference = 'Continue'
          & ${{ github.workspace }}\.github\workflows\scripts\win-required-ports.ps1

      - name: Install the collector
        run: |
          $ErrorActionPreference = 'Stop'
          $msi_path = Resolve-Path .\dist\splunk-otel-collector*.msi
          Test-Path $msi_path
          Write-Host "Installing $msi_path ..."
          $process = Start-Process -Wait -PassThru msiexec "/i `"$msi_path`" /qn /l*v msi-install.log ${{ matrix.install-properties.install }}"
          if ($process.ExitCode -ne 0) {
            throw "MSI installation failed with exit code $($process.ExitCode)"
          }

      - name: "In case of failure: display the install logs"
        if: ${{ failure() }}
        run: Get-Content msi-install.log

      - name: Start the collector service
        run: Start-Service splunk-otel-collector

      - name: Check registry expectations
        run: |
          $ErrorActionPreference = 'Stop'
          & ${{ github.workspace }}\.github\workflows\scripts\win-test-services.ps1 ${{ matrix.install-properties.expected }}

      - name: Stop the collector service
        run: Stop-Service splunk-otel-collector

      - name: "In case of failure: collect splunk-otel-collector events"
        if: ${{ failure() }}
        run: Get-WinEvent -ProviderName splunk-otel-collector | Sort-Object -Property TimeCreated | Select-Object -Property Message | Format-List

      - name: Uninstall the collector
        run: |
          $ErrorActionPreference = 'Stop'
          $uninstall_info = Get-ItemProperty HKLM:\Software\Microsoft\Windows\CurrentVersion\uninstall\* | Where { $_.DisplayName -eq "Splunk OpenTelemetry Collector" }
          # Without a registered product there is no product code to pass to
          # msiexec /x, so fail here with a message that names the real problem.
          if ($null -eq $uninstall_info) {
            throw "Splunk OpenTelemetry Collector was not found under the Uninstall registry key"
          }
          Write-Host "Uninstalling $($uninstall_info.PSChildName) ..."
          $process = Start-Process -Wait -PassThru msiexec "/x $($uninstall_info.PSChildName) /qn /l*v msi-uninstall.log"
          if ($process.ExitCode -ne 0) {
            throw "MSI uninstallation failed with exit code $($process.ExitCode)"
          }

      - name: Check no leftover on the registry
        run: |
          $ErrorActionPreference = 'Stop'
          $uninstall_info = Get-ItemProperty HKLM:\Software\Microsoft\Windows\CurrentVersion\uninstall\* | Where { $_.DisplayName -eq "Splunk OpenTelemetry Collector" }
          # $null goes on the left so a collection result is never filtered
          # element-wise by the comparison.
          if ($null -ne $uninstall_info) {
            throw "MSI uninstall failed"
          }

      - name: "In case of failure: display the uninstall logs"
        if: ${{ failure() }}
        run: Get-Content msi-uninstall.log
# ---- job: msi-test (installs the MSI through install.ps1 and verifies the result) ----
  msi-test:
    runs-on: ${{ matrix.OS }}
    needs:
      - msi-build
    strategy:
      max-parallel: 2
      matrix:
        OS: ["windows-2022"]
        MODE: ["agent", "gateway"]
        # Kept as strings: the value is spliced into PowerShell as $true/$false
        # and compared against 'true' in step conditions.
        WITH_FLUENTD: ["true", "false"]
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Downloading msi build
        uses: actions/download-artifact@v4
        with:
          name: msi-build
          path: ./dist

      - name: Ensure required ports in the dynamic range are available
        run: |
          $ErrorActionPreference = 'Continue'
          & ${{ github.workspace }}\.github\workflows\scripts\win-required-ports.ps1

      - name: Installation test
        env:
          token: fake-token
          realm: fake-realm
          memory: "256"
        run: |
          $ErrorActionPreference = 'Stop'
          Set-PSDebug -Trace 1
          $msi_path = Resolve-Path .\dist\splunk-otel-collector*.msi
          $env:VERIFY_ACCESS_TOKEN = "false"
          .\internal\buildscripts\packaging\installer\install.ps1 -access_token "${{ env.token }}" -realm "${{ env.realm }}" -msi_path "$msi_path" -mode "${{ matrix.MODE }}" -memory "${{ env.memory }}" -with_fluentd $${{ matrix.WITH_FLUENTD }}
          Start-Sleep -s 30
          & ${{ github.workspace }}\.github\workflows\scripts\win-test-services.ps1 -mode "${{ matrix.MODE }}" -access_token "${{ env.token }}" -realm "${{ env.realm }}" -memory "${{ env.memory }}" -with_fluentd "${{ matrix.WITH_FLUENTD }}"
          & ${{ github.workspace }}\.github\workflows\scripts\win-test-support-bundle.ps1 -mode "${{ matrix.MODE }}" -with_fluentd "${{ matrix.WITH_FLUENTD }}"
          Resolve-Path -Path "$env:ProgramFiles\Splunk\OpenTelemetry Collector\agent-bundle\python\python.exe"
          Resolve-Path -Path "$env:ProgramFiles\Splunk\OpenTelemetry Collector\agent-bundle\collectd-python"
          # The JMX file is installed under Disk:\opt, so we need to check each available disk to see if it's installed
          $found_jmx = $false
          (Get-PSDrive -PSProvider FileSystem).Root | ForEach-Object {
            $found_jmx = $found_jmx -Or (Test-Path (Join-Path "$_" "opt\opentelemetry-java-contrib-jmx-metrics.jar"))
          }
          if (-not $found_jmx) { throw "Could not find expected jar file" }

      # Diagnostics below run whether or not the test step succeeded.
      - name: splunk-otel-collector logs
        if: ${{ always() }}
        run: Get-WinEvent -ProviderName splunk-otel-collector | Sort-Object -Property TimeCreated | Select-Object -Property Message | Format-List

      - name: fluentd logs
        if: ${{ always() && matrix.WITH_FLUENTD == 'true' }}
        run: Get-Content -Path "${env:SYSTEMDRIVE}\opt\td-agent\td-agent.log"
# ---- job: choco-build (builds the Chocolatey package from the MSI and smoke-tests it) ----
  choco-build:
    runs-on: windows-2022
    needs: [msi-build]
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Downloading msi build
        uses: actions/download-artifact@v4
        with:
          name: msi-build
          path: ./dist

      - name: Build Chocolatey
        run: |
          $ErrorActionPreference = 'Stop'
          Set-PSDebug -Trace 1
          $msi_file_path = Resolve-Path .\dist\splunk-otel-collector*.msi
          $msi_file_name = Resolve-Path .\dist\splunk-otel-collector*.msi | Split-Path -leaf
          # The package version is taken from the MSI file name; stop right away
          # if the name carries no version instead of reading a stale $matches.
          if (-not ($msi_file_name -match '(\d+\.)(\d+\.)+(\d*)')) {
            throw "Could not extract a version from MSI file name '$msi_file_name'"
          }
          $version = $matches[0]
          write-host "Building choco package..."
          .\internal\buildscripts\packaging\choco\make.ps1 build_choco -MSIFile $msi_file_path -Version $version | Tee-Object -file .\dist\build_logs.log
          # Test-Path only reports True/False; turn a missing package into a failure.
          if (-not (Test-Path -Path ".\dist\splunk-otel-collector.$version.nupkg")) {
            throw "Expected package .\dist\splunk-otel-collector.$version.nupkg was not built"
          }

      - name: Test install without parameters
        run: |
          $ErrorActionPreference = 'Stop'
          Set-PSDebug -Trace 1
          choco install splunk-otel-collector -s=".\dist" -y
          if ($LASTEXITCODE -ne 0) {
            throw "choco install failed!"
          }
          # the collector service should not be running if installed without the SPLUNK_ACCESS_TOKEN parameter
          if ((Get-CimInstance -ClassName win32_service -Filter "Name = 'splunk-otel-collector'" | Select Name, State).State -Eq "Running") {
            throw "splunk-otel-collector is running"
          }

      - name: Uploading choco build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: choco-build
          path: ./dist/*.nupkg
# ---- job: choco-test (install and upgrade scenarios for the Chocolatey package) ----
  choco-test:
    runs-on: ${{ matrix.OS }}
    needs:
      - choco-build
    strategy:
      max-parallel: 4
      fail-fast: false
      matrix:
        OS: ["windows-2022"]
        MODE: ["agent", "gateway"]
        WITH_FLUENTD: ["true", "false"]
        SCENARIO: ["install", "upgrade"]
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Downloading choco build
        uses: actions/download-artifact@v4
        with:
          name: choco-build
          path: ./dist

      - name: Ensure required ports in the dynamic range are available
        run: |
          $ErrorActionPreference = 'Continue'
          & ${{ github.workspace }}\.github\workflows\scripts\win-required-ports.ps1

      # "install" installs the freshly built package from .\dist.
      # "upgrade" first installs version 0.74.0 (no local source given),
      # then upgrades to the package in .\dist.
      - name: Chocolatey ${{ matrix.SCENARIO }} test
        env:
          token: fake-token
          realm: fake-realm
          memory: "256"
        run: |
          $ErrorActionPreference = 'Stop'
          Set-PSDebug -Trace 1
          $choco_file_name = Resolve-Path .\dist\splunk-otel-collector*.nupkg
          $params = "/SPLUNK_ACCESS_TOKEN=${{ env.token }} /SPLUNK_REALM=${{ env.realm }} /SPLUNK_MEMORY_TOTAL_MIB=${{ env.memory }} /MODE:${{ matrix.MODE }} /WITH_FLUENTD:${{ matrix.WITH_FLUENTD }}"
          if ("${{ matrix.SCENARIO }}" -eq "install") {
            write-host "Installing $choco_file_name ..."
            choco install splunk-otel-collector -s=".\dist" --params="'$params'" -y
            if ($LASTEXITCODE -ne 0) {
              throw "choco install failed!"
            }
          } else {
            write-host "Installing splunk-otel-collector 0.74.0 ..."
            choco feature enable -n=useRememberedArgumentsForUpgrades
            choco install splunk-otel-collector --no-progress --version=0.74.0 --params="'$params'" -y
            if ($LASTEXITCODE -ne 0) {
              throw "choco install failed!"
            }
            Start-Sleep 30
            write-host "Upgrading $choco_file_name ..."
            choco upgrade splunk-otel-collector -s=".\dist" -y
            if ($LASTEXITCODE -ne 0) {
              throw "choco upgrade failed!"
            }
          }
          Start-Sleep -s 30
          & ${{ github.workspace }}\.github\workflows\scripts\win-test-services.ps1 -mode "${{ matrix.MODE }}" -access_token "${{ env.token }}" -realm "${{ env.realm }}" -memory "${{ env.memory }}" -with_fluentd "${{ matrix.WITH_FLUENTD }}"
          & ${{ github.workspace }}\.github\workflows\scripts\win-test-support-bundle.ps1 -mode "${{ matrix.MODE }}" -with_fluentd "${{ matrix.WITH_FLUENTD }}"

      # Diagnostics below run whether or not the test step succeeded.
      - name: splunk-otel-collector logs
        if: ${{ always() }}
        run: Get-WinEvent -ProviderName splunk-otel-collector | Sort-Object -Property TimeCreated | Select-Object -Property Message | Format-List

      - name: fluentd logs
        if: ${{ always() && matrix.WITH_FLUENTD == 'true' }}
        run: Get-Content -Path "${env:SYSTEMDRIVE}\opt\td-agent\td-agent.log"

      - name: Uninstall test
        run: |
          choco uninstall splunk-otel-collector -y
          if ($LASTEXITCODE -ne 0) {
            throw "choco uninstall failed!"
          }
          Start-Sleep -s 30
          if ((Get-CimInstance -ClassName win32_service -Filter "Name = 'splunk-otel-collector'" | Select Name, State).State -Eq "Running") {
            throw "splunk-otel-collector service is still running"
          }
# ---- job: docker-otelcol (builds the Windows image and checks the container) ----
  docker-otelcol:
    runs-on: ${{ matrix.OS }}
    needs: [cross-compile, agent-bundle-windows]
    strategy:
      matrix:
        OS: ["windows-2022"]
    steps:
      - name: Check out the codebase.
        uses: actions/checkout@v4

      - name: Downloading binaries-windows_amd64
        uses: actions/download-artifact@v4
        with:
          name: binaries-windows_amd64
          path: ./bin

      - uses: actions/download-artifact@v4
        with:
          name: agent-bundle-windows
          path: ./dist

      # The binary and bundle are copied into the build context for the
      # duration of the build and removed afterwards.
      - name: Build docker image
        run: |
          $ErrorActionPreference = 'Stop'
          Copy-Item .\bin\otelcol_windows_amd64.exe .\cmd\otelcol\otelcol.exe
          Copy-Item .\dist\agent-bundle_windows_amd64.zip .\cmd\otelcol\agent-bundle_windows_amd64.zip
          docker build -t otelcol-windows --build-arg BASE_IMAGE=mcr.microsoft.com/windows/servercore:ltsc2022 --build-arg JMX_METRIC_GATHERER_RELEASE=$(Get-Content internal\buildscripts\packaging\jmx-metric-gatherer-release.txt) -f .\cmd\otelcol\Dockerfile.windows .\cmd\otelcol\
          Remove-Item .\cmd\otelcol\otelcol.exe
          Remove-Item .\cmd\otelcol\agent-bundle_windows_amd64.zip

      - name: Run docker image
        run: |
          $ErrorActionPreference = 'Stop'
          docker run -d -e SPLUNK_ACCESS_TOKEN=12345 -e SPLUNK_REALM=fake-realm --name otelcol otelcol-windows:latest
          Start-Sleep 10
          $DockerOutput=$(docker ps --filter=status=running --filter=name=otelcol -q)
          if ( $null -eq $DockerOutput ) {
            docker logs otelcol
            echo "Failing job execution: fail to start otelcol docker container in 10 seconds."
            exit 1
          }
          # `java -version` writes its banner to stderr. cmd.exe merges stderr into
          # stdout so PowerShell receives plain text rather than error records,
          # and no TTY is requested because the step is not interactive.
          $JavaCmdOutput = (cmd /c "docker exec otelcol java -version 2>&1" | Out-String).Trim()
          if ( $JavaCmdOutput -notmatch 'openjdk version' ) {
            docker logs otelcol
            echo "Failing job execution: Failed to run java, command output: ${JavaCmdOutput}"
            exit 1
          }