Fix CI and Regression workflows for MAC Arm64 (#3128)
* Update ci and regression cpu workflow for MAC

* Skip MPS tests when running on M1 in CPU-only mode

* Fix test parametrization

* Fix env variable config to skip MPS tests

* Update workflow files

* Upgrade nick-fields/retry to v3

* Fix test import error

* Fix Mac M1 CPU only tests

* Fix env variable comparison

* Fix Mac M1 regression test
namannandan committed May 9, 2024
1 parent 0b4539f commit 087e813
Showing 6 changed files with 77 additions and 19 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/ci_cpu.yml
@@ -21,18 +21,19 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macOS-latest]
+        os: [ubuntu-20.04, macos-latest]
     steps:
       - name: Setup Python for M1
-        if: matrix.os == 'macos-14'
+        if: matrix.os == 'macos-latest'
         uses: actions/setup-python@v5
         with:
           python-version: '3.10'
+          architecture: arm64
       - name: Setup Python for all other OS
-        if: matrix.os != 'macos-14'
+        if: matrix.os != 'macos-latest'
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.9'
           architecture: x64
       - name: Setup Java 17
         uses: actions/setup-java@v3
@@ -47,7 +48,9 @@ jobs:
         run: |
           python ts_scripts/install_dependencies.py --environment=dev
       - name: Torchserve Sanity
-        uses: nick-fields/retry@v2
+        env:
+          TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
+        uses: nick-fields/retry@v3
         with:
           timeout_minutes: 60
           max_attempts: 3
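The ${{ matrix.os == 'macos-latest' && 'True' || 'False' }} expression evaluates to the string 'True' only on the macOS Arm64 runner, so TS_MAC_ARM64_CPU_ONLY always reaches the test process as a plain string. A minimal sketch of the string-comparison gate the tests further down rely on (the helper name is illustrative, not part of the repo):

import os

def mac_arm64_cpu_only() -> bool:
    # Environment values are strings, so compare against the literal "True"
    # rather than treating the variable as a boolean.
    return os.environ.get("TS_MAC_ARM64_CPU_ONLY", "False") == "True"

if mac_arm64_cpu_only():
    print("MPS-dependent tests will be skipped")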
2 changes: 1 addition & 1 deletion .github/workflows/ci_gpu.yml
@@ -45,7 +45,7 @@ jobs:
         run: |
           python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
       - name: Torchserve Sanity
-        uses: nick-fields/retry@v2
+        uses: nick-fields/retry@v3
         with:
           timeout_minutes: 60
           retry_on: error
13 changes: 8 additions & 5 deletions .github/workflows/regression_tests_cpu.yml
@@ -15,23 +15,24 @@ concurrency:
 
 jobs:
   regression-cpu:
-    # creates workflows for OS: ubuntu, macOS, macOS M1
+    # creates workflows for OS: ubuntu, macOS M1
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macOS-latest]
+        os: [ubuntu-20.04, macos-latest]
     steps:
       - name: Setup Python for M1
-        if: matrix.os == 'macos-14'
+        if: matrix.os == 'macos-latest'
         uses: actions/setup-python@v5
         with:
           python-version: '3.10'
+          architecture: arm64
       - name: Setup Python for all other OS
-        if: matrix.os != 'macos-14'
+        if: matrix.os != 'macos-latest'
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.9'
           architecture: x64
       - name: Setup Java 17
         uses: actions/setup-java@v3
@@ -46,5 +47,7 @@ jobs:
         run: |
           python ts_scripts/install_dependencies.py --environment=dev
       - name: Torchserve Regression Tests
+        env:
+          TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
         run: |
           python test/regression_tests.py
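To reproduce the macos-latest job locally, the same flag can be set before invoking the regression suite. A hypothetical local invocation mirroring the env block above (assumes it is run from the repository root):

import os
import subprocess

# Run the regression suite with MPS-dependent tests disabled,
# just as the macos-latest job does via TS_MAC_ARM64_CPU_ONLY.
env = dict(os.environ, TS_MAC_ARM64_CPU_ONLY="True")
subprocess.run(["python", "test/regression_tests.py"], env=env, check=True)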
@@ -875,7 +875,8 @@ private static int getAvailableGpu() {
                     }
                 }
             }
-            throw new AssertionError("Unexpected response.");
+            // No MPS devices detected
+            return 0;
         } else {
             Process process =
                     Runtime.getRuntime().exec("nvidia-smi --query-gpu=index --format=csv");
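Returning 0 instead of throwing lets the frontend report zero GPUs on Arm64 Macs without a usable Metal device, and workers then stay on CPU. A rough Python-side sketch of that fallback, matching what the handler code below asserts (an illustration, not the actual BaseHandler implementation):

import torch

def pick_device(gpu_id):
    # Use MPS only when the frontend assigned a gpu_id and the backend
    # actually exposes a Metal device; otherwise stay on CPU.
    if gpu_id is not None and torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")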
@@ -115,8 +115,13 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
         ConfigManager.init(args);
         ConfigManager configManager = ConfigManager.getInstance();
         String arch = System.getProperty("os.arch");
+        String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
         if (arch.equals("aarch64")) {
-            Assert.assertTrue(configManager.getNumberOfGpu() > 0);
+            if (mac_arm64_cpu_only.equals("True")) {
+                Assert.assertEquals(configManager.getNumberOfGpu(), 0);
+            } else {
+                Assert.assertTrue(configManager.getNumberOfGpu() > 0);
+            }
         }
     }
 }
58 changes: 52 additions & 6 deletions test/pytest/test_device_config.py
@@ -19,6 +19,20 @@
 mnist_scriptes_py = os.path.join(REPO_ROOT, "examples/image_classifier/mnist/mnist.py")
 
 HANDLER_PY = """
+import torch
+from ts.torch_handler.base_handler import BaseHandler
+class deviceHandler(BaseHandler):
+    def initialize(self, context):
+        super().initialize(context)
+        if torch.backends.mps.is_available() and context.system_properties.get("gpu_id") is not None:
+            assert self.get_device().type == "mps"
+        else:
+            assert self.get_device().type == "cpu"
+"""
+
+HANDLER_PY_GPU = """
 from ts.torch_handler.base_handler import BaseHandler
 class deviceHandler(BaseHandler):
@@ -28,6 +42,16 @@ def initialize(self, context):
         assert self.get_device().type == "mps"
 """
 
+HANDLER_PY_CPU = """
+from ts.torch_handler.base_handler import BaseHandler
+class deviceHandler(BaseHandler):
+    def initialize(self, context):
+        super().initialize(context)
+        assert self.get_device().type == "cpu"
+"""
+
 MODEL_CONFIG_YAML = """
 #frontend settings
 # TorchServe frontend parameters
@@ -78,8 +102,23 @@ def get_config(param):
     return get_config(request.param)
 
 
+@pytest.fixture(scope="module")
+def handler_py(request):
+    def get_handler(param):
+        if param == "cpu":
+            return HANDLER_PY_CPU
+        elif param == "gpu":
+            return HANDLER_PY_GPU
+        else:
+            return HANDLER_PY
+
+    return get_handler(request.param)
+
+
 @pytest.fixture(scope="module", name="mar_file_path")
-def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
+def create_mar_file(
+    work_dir, model_archiver, model_name, model_config_name, handler_py
+):
     mar_file_path = work_dir.joinpath(model_name + ".mar")
 
     model_config_yaml_file = work_dir / "model_config.yaml"
@@ -90,7 +129,7 @@ def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
     model_py_file.write_text(mnist_scriptes_py)
 
     handler_py_file = work_dir / "handler.py"
-    handler_py_file.write_text(HANDLER_PY)
+    handler_py_file.write_text(handler_py)
 
     config = ModelArchiverConfig(
         model_name=model_name,
@@ -147,22 +186,29 @@ def register_model(mar_file_path, model_store, torchserve):
     test_utils.unregister_model(model_name)
 
 
-@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
+@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
+@pytest.mark.skipif(
+    os.environ.get("TS_MAC_ARM64_CPU_ONLY", "False") == "True",
+    reason="Skip if running only on MAC CPU",
+)
 @pytest.mark.parametrize("model_config_name", ["gpu"], indirect=True)
+@pytest.mark.parametrize("handler_py", ["gpu"], indirect=True)
 def test_m1_device(model_name, model_config_name):
     response = requests.get(f"http://localhost:8081/models/{model_name}")
     assert response.status_code == 200, "Describe Failed"
 
 
-@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
+@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
 @pytest.mark.parametrize("model_config_name", ["cpu"], indirect=True)
+@pytest.mark.parametrize("handler_py", ["cpu"], indirect=True)
 def test_m1_device_cpu(model_name, model_config_name):
     response = requests.get(f"http://localhost:8081/models/{model_name}")
-    assert response.status_code == 404, "Describe Worked"
+    assert response.status_code == 200, "Describe Failed"
 
 
-@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
+@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
 @pytest.mark.parametrize("model_config_name", ["default"], indirect=True)
+@pytest.mark.parametrize("handler_py", ["default"], indirect=True)
 def test_m1_device_default(model_name, model_config_name):
     response = requests.get(f"http://localhost:8081/models/{model_name}")
     assert response.status_code == 200, "Describe Failed"
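The handler_py fixture above is driven by pytest's indirect parametrization: with indirect=True, the value named in @pytest.mark.parametrize is handed to the fixture as request.param rather than to the test function directly. A self-contained toy example of the mechanism (names here are illustrative only):

import pytest

@pytest.fixture
def flavor(request):
    # request.param carries the value supplied by the parametrize marker.
    return {"cpu": "runs on cpu", "gpu": "runs on gpu"}[request.param]

@pytest.mark.parametrize("flavor", ["cpu", "gpu"], indirect=True)
def test_flavor(flavor):
    assert flavor.startswith("runs on")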
