From 010aefc197a480ff028308ea2e890507a19941b1 Mon Sep 17 00:00:00 2001
From: Lau Johansson
Date: Fri, 5 May 2023 23:35:30 +0200
Subject: [PATCH] feat: use spetlr-tools for integration tests

---
 .github/submit/cluster104.json        |  27 ++++
 .github/submit/cluster91.json         |  27 ++++
 .github/submit/fetch_test_job.ps1     |  88 -------------
 .github/submit/main.py                |  46 -------
 .github/submit/submit_test_job.ps1    | 182 --------------------------
 .github/workflows/pre-integration.yml |  28 ++--
 6 files changed, 71 insertions(+), 327 deletions(-)
 create mode 100644 .github/submit/cluster104.json
 create mode 100644 .github/submit/cluster91.json
 delete mode 100644 .github/submit/fetch_test_job.ps1
 delete mode 100644 .github/submit/main.py
 delete mode 100644 .github/submit/submit_test_job.ps1

diff --git a/.github/submit/cluster104.json b/.github/submit/cluster104.json
new file mode 100644
index 00000000..176b8463
--- /dev/null
+++ b/.github/submit/cluster104.json
@@ -0,0 +1,27 @@
+{
+  "spark_version": "10.4.x-scala2.12",
+  "spark_conf": {
+    "spark.databricks.cluster.profile": "singleNode",
+    "spark.master": "local[*, 4]",
+    "spark.databricks.delta.preview.enabled": true,
+    "spark.databricks.io.cache.enabled":true
+  },
+  "azure_attributes": {
+    "availability": "ON_DEMAND_AZURE",
+    "first_on_demand": 1,
+    "spot_bid_max_price": -1
+  },
+  "node_type_id": "Standard_DS3_v2",
+  "custom_tags": {
+    "ResourceClass":"SingleNode"
+  },
+  "spark_env_vars": {
+    "PYSPARK_PYTHON": "/databricks/python3/bin/python3"
+  },
+  "cluster_log_conf": {
+    "dbfs": {
+      "destination": ""
+    }
+  },
+  "num_workers": 0
+}
\ No newline at end of file
diff --git a/.github/submit/cluster91.json b/.github/submit/cluster91.json
new file mode 100644
index 00000000..cf383719
--- /dev/null
+++ b/.github/submit/cluster91.json
@@ -0,0 +1,27 @@
+{
+  "spark_version": "9.1.x-scala2.12",
+  "spark_conf": {
+    "spark.databricks.cluster.profile": "singleNode",
+    "spark.master": "local[*, 4]",
+    "spark.databricks.delta.preview.enabled": true,
+    "spark.databricks.io.cache.enabled":true
+  },
+  "azure_attributes": {
+    "availability": "ON_DEMAND_AZURE",
+    "first_on_demand": 1,
+    "spot_bid_max_price": -1
+  },
+  "node_type_id": "Standard_DS3_v2",
+  "custom_tags": {
+    "ResourceClass":"SingleNode"
+  },
+  "spark_env_vars": {
+    "PYSPARK_PYTHON": "/databricks/python3/bin/python3"
+  },
+  "cluster_log_conf": {
+    "dbfs": {
+      "destination": ""
+    }
+  },
+  "num_workers": 0
+}
\ No newline at end of file
diff --git a/.github/submit/fetch_test_job.ps1 b/.github/submit/fetch_test_job.ps1
deleted file mode 100644
index 490941ab..00000000
--- a/.github/submit/fetch_test_job.ps1
+++ /dev/null
@@ -1,88 +0,0 @@
-# once a test run has been submitted with submit_test_job, and a test_job_details.json file
-# is available, you can immediately run this script to fetch the result.
-# It will poll the databricks api with 5 second intervals until the job
-# has ended (reporting available progress along the way).
-# It will then attempt to fetch the results.log file that was written by the jobs main function
-# In the very end, if all tests succeeded, then the job will have succeeded, and then this script succeeds
-param (
-
-    [Parameter(Mandatory=$false)]
-    [ValidateNotNullOrEmpty()]
-    [string]
-    $testJobDetails= "test_job_details.json"
-
-)
-
-if(-not (Test-Path -Path $testJobDetails -PathType Leaf)){
-    Write-Host -ForegroundColor Red "ERROR: The file $testJobDetails does not exist. Please run submit_test_job.ps1 first."
-    EXIT 1
-}
-
-
-# import utility functions
-. "$PSScriptRoot\..\deploy\Utilities\all.ps1"
-
-$job_details = Get-Content $testJobDetails | ConvertFrom-Json
-
-$runId = $job_details.runId
-$testDir = $job_details.testDir
-$resultLogs = $job_details.logOut
-$srcDir = "$PSScriptRoot/"
-
-
-# report on status
-Write-Host "============================================================================"
-Write-Host "Run with ID $runId"
-Write-Host "Test dir $testDir"
-
-$run = (databricks runs get --run-id $runId | ConvertFrom-Json)
-Write-Host "Run url: $($run.run_page_url)"
-$clusterID = $run.cluster_instance.cluster_id
-$state = ""
-$state_msg = ""
-while ($run.end_time -eq 0){
-    Start-Sleep -Seconds 5
-
-    $run = (databricks runs get --run-id $runId | ConvertFrom-Json)
-    $clusterID = $run.cluster_instance.cluster_id
-
-    # display the messages if they have changed
-    if($run.state.life_cycle_state -ne $state){
-        $state = $run.state.life_cycle_state
-        Write-Host "Run is now in state $state"
-    }
-    if($run.state.state_message -ne $state_msg){
-        $state_msg = $run.state.state_message
-        if($state_msg){
-            Write-Host "Run message: $state_msg"
-        }
-    }
-}
-
-Write-Host "Run has ended. Now fetching logs..."
-# the job is complete. Get the logs
-$timeout = 60
-$localLogs="$srcDir/test_job_results_$($job_details.submissionTime).log"
-do{
-    dbfs cp --overwrite $resultLogs $localLogs *>$null
-    if($LASTEXITCODE -eq 0) {break;}
-
-    $timeout-=1
-    Start-Sleep -Seconds 1
-} until( $timeout -lt 1 )
-
-if($timeout -lt 1){
-    throw "Unable to get logs from $resultLogs"
-}
-
-Write-Host "Logs can be seen in $localLogs"
-Write-Host "============================================================================"
-
-Get-Content $localLogs
-
-Write-Host "Overall the result is $($run.state.result_state)"
-if($run.state.result_state -eq "SUCCESS"){
-    EXIT 0
-}else {
-    EXIT 1
-}
diff --git a/.github/submit/main.py b/.github/submit/main.py
deleted file mode 100644
index d83cc48e..00000000
--- a/.github/submit/main.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import argparse
-import contextlib
-import io
-import os
-import sys
-import unittest
-
-
-def run_all():
-    parser = argparse.ArgumentParser(description="Run Test Cases.")
-    parser.add_argument(
-        "--basedir", type=str, required=True, help="parent location of test library"
-    )
-    parser.add_argument(
-        "--folder", type=str, required=True, help="which test folder to run"
-    )
-
-    args = parser.parse_args()
-
-    # process the logout path
-    basedir = args.basedir
-    if not str(basedir).startswith("dbfs:/"):
-        print("WARNING: argument basedir must start with dbfs:/")
-    else:
-        basedir = f"/dbfs/{basedir[6:]}"
-
-    os.chdir(basedir)
-
-    sys.path = [os.getcwd()] + sys.path
-
-    suite = unittest.TestLoader().discover(args.folder)
-    with io.StringIO() as buf:
-        # run the tests
-        with contextlib.redirect_stdout(buf):
-            res = unittest.TextTestRunner(stream=buf, failfast=True).run(suite)
-        output = buf.getvalue()
-        print(output)
-        with open("results.log", "w") as f:
-            f.write(output)
-
-    return 0 if res.wasSuccessful() else 1
-
-
-if __name__ == "__main__":
-    if int(run_all()):
-        sys.exit(1)
diff --git a/.github/submit/submit_test_job.ps1 b/.github/submit/submit_test_job.ps1
deleted file mode 100644
index b88b5c28..00000000
--- a/.github/submit/submit_test_job.ps1
+++ /dev/null
@@ -1,182 +0,0 @@
-# This script submits a run to databricks to execute the complete test-suite
-# as a prerequisite, the library under test should already be built
-# This script does a the following things in order:
-# - creates a unique test area on the databricks file system (dbfs)
-# - clean the entire tests folder and copies it to dbfs
-# - copies the library to the test area
-# - submits a job run to databricks that
-#   - installs the library from the test area
-#   - executes a main file that
-#     - discovers unittests and executes them within the current python interpreter
-#       (only the current interpreter carries references to the spark runtime.
-#        calling subprocesses does not work.)
-#     - writes all test stdout to a log file in the test area once finished.
-# - once the job is submitted, all job details are written to a json file
-# The details json file can be used to fetch the job result with the fetch_test_job script
-
-param (
-    # to submit parallel runs, you must specify this parameter
-    [Parameter(Mandatory=$false)]
-    [ValidateNotNullOrEmpty()]
-    [string]
-    $testJobDetails= "test_job_details.json",
-
-    # in the pipeline we wish to test with multiple versions.
-    [Parameter(Mandatory=$false)]
-    [ValidateNotNullOrEmpty()]
-    [string]
-    $sparkVersion = "9.1.x-scala2.12",
-
-    [Parameter(Mandatory=$false)]
-    [ValidateNotNullOrEmpty()]
-    [string]
-    $sparkLibs = "sparklibs91.json"
-
-
-)
-
-# get the true repository root
-$repoRoot = (git rev-parse --show-toplevel)
-
-
-# start time of this script for job details
-$now = (Get-Date -Format yyyy-MM-ddTHH.mm)
-
-
-
-# import utility functions
-. "$PSScriptRoot\..\deploy\Utilities\all.ps1"
-$spark_dependencies = Get-Content "$PSScriptRoot/$sparkLibs" | ConvertFrom-Json
-
-
-# for separating tasks, we will do everything in our own dir (allows parallel jobs):
-$testguid = "$([guid]::NewGuid())"
-$testDir = "dbfs:/test/$([guid]::NewGuid())"
-dbfs mkdirs $testDir
-
-# discover libraries in the dist folder
-[array]$libs = Get-ChildItem -Path "$repoRoot/dist" -Filter *.whl | ForEach-Object -Member name
-Write-Host "To be installed on cluster: $($libs -join ", ")"
-[array]$sparkWheels = $libs | ForEach-Object -Process {@{whl = "$testDir/dist/$_"}}
-
-[array]$testWheels = Get-Content -Path "$repoRoot/test_requirements.txt" | ForEach-Object -Process {@{pypi = @{package="$_"}}}
-
-
-# upload the library
-dbfs cp -r --overwrite "$repoRoot/dist" "$testDir/dist"
-
-# upload the test main file
-dbfs cp --overwrite "$PSScriptRoot/main.py" "$testDir/main.py"
-
-
-# next step is to upload all unittests
-Push-Location -Path $repoRoot
-
-pip install pyclean
-pyclean tests # remove *.pyc and __pycache__
-# upload all tests
-dbfs cp --overwrite -r tests/ "$testDir/tests"
-
-Pop-Location
-
-# remote path of the log
-$logOut = "$testDir/results.log"
-
-# construct the run submission configuration
-$run = @{
-    run_name = "Testing Run"
-    # single node cluster is sufficient
-    new_cluster= @{
-        spark_version=$sparkVersion
-        spark_conf= @{
-            "spark.databricks.cluster.profile"= "singleNode"
-            "spark.master"= "local[*, 4]"
-            "spark.databricks.delta.preview.enabled"= $true
-            "spark.databricks.io.cache.enabled"= $true
-        }
-        azure_attributes=${
-            "availability"= "ON_DEMAND_AZURE",
-            "first_on_demand": 1,
-            "spot_bid_max_price": -1
-        }
-        node_type_id= "Standard_DS3_v2"
-        custom_tags =@{
-            ResourceClass="SingleNode"
-        }
-        spark_env_vars= @{
-            PYSPARK_PYTHON= "/databricks/python3/bin/python3"
-        }
-        cluster_log_conf= @{
-            dbfs=@{
-                destination="$testDir/cluster-logs"
-            }
-        }
-        num_workers= 0
-    }
-    # in addition to standard dependencies, install the libs that we just uploaded
-    libraries= $spark_dependencies + $sparkWheels + $testWheels
-
-    # This scripts runs the test suite
-    spark_python_task= @{
-        python_file="$testDir/main.py"
-        parameters=@(
-            # running in the spark python interpreter, the python __file__ variable does not
-            # work. Hence, we need to tell the script where the test area is.
-            "--basedir=$testDir",
-            # we can actually run any part of out test suite, but some files need the full repo.
-            # Only run tests from this folder.
-            "--folder=tests/cluster"
-        )
-    }
-}
-
-# We used to get this warning:
-# WARN: Your CLI is configured to use Jobs API 2.0. In order to use the latest Jobs features please upgrade to 2.1: 'databricks jobs configure --version=2.1'. Future versions of
-# this CLI will default to the new Jobs API. Learn more at https://docs.databricks.com/dev-tools/cli/jobs-cli.html
-databricks jobs configure --version=2.1
-
-# databricks runs submit actually has an option to pass the json on the command line.
-# But here we need to do this with a json file because the json string is pretty funky and it breaks otherwise
-Set-Content "$repoRoot/run.json" ($run | ConvertTo-Json -Depth 4)
-# submit the run and save the ID
-$runId = (databricks runs submit --json-file "$repoRoot/run.json" | ConvertFrom-Json).run_id
-Remove-Item "$repoRoot/run.json"
-
-# report on status
-Write-Host "============================================================================"
-Write-Host "Started Run with ID $runId"
-Write-Host "Using test dir $testDir"
-
-$run = (databricks runs get --run-id $runId | ConvertFrom-Json)
-Write-Host "Run url: $($run.run_page_url)"
-
-# Roll the test details. When testing locally, this makes it easier to recover old runs.
-if(Test-Path -Path $testJobDetails -PathType Leaf)
-{
-    $old_job_details = Get-Content $testJobDetails | ConvertFrom-Json
-    $new_filename = "$(Split-Path -LeafBase $testJobDetails).$($old_job_details.submissionTime).json"
-    $parent = Split-Path -Parent $testJobDetails
-    if ($parent)
-    {
-        $new_filename = Join-Path -Path $parent -ChildPath $new_filename
-    }
-    Set-Content "$new_filename" ($old_job_details | ConvertTo-Json -Depth 4)
-    Write-Host "Previous details at $testJobDetails were moved to $new_filename."
-}
-
-# write the test details file
-$job_details = @{
-    runId=$runId
-    testDir=$testDir
-    submissionTime=$now
-    testFolder=$testFolder
-    environmentType= $environmentType
-    environmentName=$environmentName
-    logOut=$logOut
-}
-Set-Content "$testJobDetails" ($job_details | ConvertTo-Json -Depth 4)
-
-Write-Host "test job details written to $testJobDetails"
-Write-Host "you can now use fetch_test_job.ps1 to check and collect the result of your test run."
-Write-Host "============================================================================" - diff --git a/.github/workflows/pre-integration.yml b/.github/workflows/pre-integration.yml index d289b088..29fe5571 100644 --- a/.github/workflows/pre-integration.yml +++ b/.github/workflows/pre-integration.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine packaging + pip install setuptools wheel twine packaging spetlr-tools - name: Build shell: pwsh run: .github/submit/build.ps1 @@ -80,19 +80,25 @@ jobs: - name: Launch integration tests 9.1 shell: pwsh run: | - .github/submit/submit_test_job.ps1 ` - -sparkVersion "9.1.x-scala2.12" ` - -testJobDetails job91.json ` - -sparkLibs sparklibs91.json + spetlr-test-job submit ` + --tests tests/ ` + --task tests/cluster/ ` + --cluster-file .github/submit/cluster91.json ` + --requirements-file test_requirements.txt ` + --sparklibs-file .github/submit/sparklibs91.json ` + --out-json test91.json - name: Launch integration tests 10.4 shell: pwsh run: | - .github/submit/submit_test_job.ps1 ` - -sparkVersion "10.4.x-scala2.12" ` - -testJobDetails job104.json ` - -sparkLibs sparklibs104.json + spetlr-test-job submit ` + --tests tests/ ` + --task tests/cluster/ ` + --cluster-file .github/submit/cluster104.json ` + --requirements-file test_requirements.txt ` + --sparklibs-file .github/submit/sparklibs104.json ` + --out-json test104.json - name: Wait 2 min for things to settle shell: pwsh @@ -100,11 +106,11 @@ jobs: - name: Fetch integration tests 9.1 shell: pwsh - run: .github/submit/fetch_test_job.ps1 -testJobDetails job91.json + run: spetlr-test-job fetch --runid-json test91.json - name: Fetch integration tests 10.4 shell: pwsh - run: .github/submit/fetch_test_job.ps1 -testJobDetails job104.json + run: spetlr-test-job fetch --runid-json test104.json - name: Delete Deployment if: always() # this step runs even if the pipeline is manually cancelled