Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CI and Regression workflows for Mac Arm64 #3128

Merged
merged 12 commits into from
May 9, 2024
13 changes: 8 additions & 5 deletions .github/workflows/ci_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,19 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04, macOS-latest]
os: [ubuntu-20.04, macos-latest]
steps:
- name: Setup Python for M1
if: matrix.os == 'macos-14'
if: matrix.os == 'macos-latest'
uses: actions/setup-python@v5
with:
python-version: '3.10'
architecture: arm64
- name: Setup Python for all other OS
if: matrix.os != 'macos-14'
if: matrix.os != 'macos-latest'
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.9'
architecture: x64
- name: Setup Java 17
uses: actions/setup-java@v3
Expand All @@ -47,7 +48,9 @@ jobs:
run: |
python ts_scripts/install_dependencies.py --environment=dev
- name: Torchserve Sanity
uses: nick-fields/retry@v2
env:
TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
uses: nick-fields/retry@v3
with:
timeout_minutes: 60
max_attempts: 3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
run: |
python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
- name: Torchserve Sanity
uses: nick-fields/retry@v2
uses: nick-fields/retry@v3
with:
timeout_minutes: 60
retry_on: error
Expand Down
13 changes: 8 additions & 5 deletions .github/workflows/regression_tests_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,24 @@ concurrency:

jobs:
regression-cpu:
# creates workflows for OS: ubuntu, macOS, macOS M1
# creates workflows for OS: ubuntu, macOS M1
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04, macOS-latest]
os: [ubuntu-20.04, macos-latest]
steps:
- name: Setup Python for M1
if: matrix.os == 'macos-14'
if: matrix.os == 'macos-latest'
uses: actions/setup-python@v5
with:
python-version: '3.10'
architecture: arm64
- name: Setup Python for all other OS
if: matrix.os != 'macos-14'
if: matrix.os != 'macos-latest'
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.9'
architecture: x64
- name: Setup Java 17
uses: actions/setup-java@v3
Expand All @@ -46,5 +47,7 @@ jobs:
run: |
python ts_scripts/install_dependencies.py --environment=dev
- name: Torchserve Regression Tests
env:
TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
run: |
python test/regression_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,8 @@ private static int getAvailableGpu() {
}
}
}
throw new AssertionError("Unexpected response.");
// No MPS devices detected
return 0;
} else {
Process process =
Runtime.getRuntime().exec("nvidia-smi --query-gpu=index --format=csv");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,13 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
ConfigManager.init(args);
ConfigManager configManager = ConfigManager.getInstance();
String arch = System.getProperty("os.arch");
String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
if (arch.equals("aarch64")) {
Assert.assertTrue(configManager.getNumberOfGpu() > 0);
if (mac_arm64_cpu_only.equals("True")) {
Assert.assertEquals(configManager.getNumberOfGpu(), 0);
} else {
Assert.assertTrue(configManager.getNumberOfGpu() > 0);
}
}
}
}
58 changes: 52 additions & 6 deletions test/pytest/test_device_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,20 @@
mnist_scriptes_py = os.path.join(REPO_ROOT, "examples/image_classifier/mnist/mnist.py")

# Default handler source written into the generated .mar archive.
# initialize() asserts the device the frontend selected: "mps" when MPS is
# available AND the frontend assigned a gpu_id, otherwise "cpu".
# NOTE: the scraped diff had lost the indentation inside this string; it is
# restored here so the emitted handler.py is syntactically valid Python.
HANDLER_PY = """
import torch
from ts.torch_handler.base_handler import BaseHandler

class deviceHandler(BaseHandler):

    def initialize(self, context):
        super().initialize(context)
        if torch.backends.mps.is_available() and context.system_properties.get("gpu_id") is not None:
            assert self.get_device().type == "mps"
        else:
            assert self.get_device().type == "cpu"
"""

HANDLER_PY_GPU = """
from ts.torch_handler.base_handler import BaseHandler

class deviceHandler(BaseHandler):
Expand All @@ -28,6 +42,16 @@ def initialize(self, context):
assert self.get_device().type == "mps"
"""

# Handler source for the CPU-only test case: initialize() must always end up
# on the "cpu" device. Indentation inside the string is restored (the scraped
# diff had stripped it) so the emitted handler.py is syntactically valid.
HANDLER_PY_CPU = """
from ts.torch_handler.base_handler import BaseHandler

class deviceHandler(BaseHandler):

    def initialize(self, context):
        super().initialize(context)
        assert self.get_device().type == "cpu"
"""

MODEL_CONFIG_YAML = """
#frontend settings
# TorchServe frontend parameters
Expand Down Expand Up @@ -78,8 +102,23 @@ def get_config(param):
return get_config(request.param)


@pytest.fixture(scope="module")
def handler_py(request):
    """Return the handler source matching the requested device mode.

    ``request.param`` is "cpu", "gpu", or anything else (treated as the
    default MPS-or-CPU handler). Used indirectly via
    ``@pytest.mark.parametrize("handler_py", [...], indirect=True)``.
    """
    # Dict dispatch; unknown params fall back to the default handler.
    sources = {"cpu": HANDLER_PY_CPU, "gpu": HANDLER_PY_GPU}
    return sources.get(request.param, HANDLER_PY)


@pytest.fixture(scope="module", name="mar_file_path")
def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
def create_mar_file(
work_dir, model_archiver, model_name, model_config_name, handler_py
):
mar_file_path = work_dir.joinpath(model_name + ".mar")

model_config_yaml_file = work_dir / "model_config.yaml"
Expand All @@ -90,7 +129,7 @@ def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
model_py_file.write_text(mnist_scriptes_py)

handler_py_file = work_dir / "handler.py"
handler_py_file.write_text(HANDLER_PY)
handler_py_file.write_text(handler_py)

config = ModelArchiverConfig(
model_name=model_name,
Expand Down Expand Up @@ -147,22 +186,29 @@ def register_model(mar_file_path, model_store, torchserve):
test_utils.unregister_model(model_name)


# Diff residue removed: the stale pre-PR skipif line (reason "Skip on Mac M1")
# duplicated its replacement; only the updated decorator is kept.
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
@pytest.mark.skipif(
    os.environ.get("TS_MAC_ARM64_CPU_ONLY", "False") == "True",
    reason="Skip if running only on MAC CPU",
)
@pytest.mark.parametrize("model_config_name", ["gpu"], indirect=True)
@pytest.mark.parametrize("handler_py", ["gpu"], indirect=True)
def test_m1_device(model_name, model_config_name):
    """GPU-configured model must register successfully on Mac M1 with MPS."""
    response = requests.get(f"http://localhost:8081/models/{model_name}")
    assert response.status_code == 200, "Describe Failed"


# Diff residue removed: the stale skipif line and the stale "== 404" assert
# (pre-PR behavior) duplicated their replacements; only the updated versions
# are kept — the CPU-configured model is now expected to register (200).
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
@pytest.mark.parametrize("model_config_name", ["cpu"], indirect=True)
@pytest.mark.parametrize("handler_py", ["cpu"], indirect=True)
def test_m1_device_cpu(model_name, model_config_name):
    """CPU-configured model must register successfully on Mac M1."""
    response = requests.get(f"http://localhost:8081/models/{model_name}")
    assert response.status_code == 200, "Describe Failed"


# Diff residue removed: the stale pre-PR skipif line duplicated its
# replacement; only the updated decorator is kept.
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
@pytest.mark.parametrize("model_config_name", ["default"], indirect=True)
@pytest.mark.parametrize("handler_py", ["default"], indirect=True)
def test_m1_device_default(model_name, model_config_name):
    """Default-configured model must register successfully on Mac M1."""
    response = requests.get(f"http://localhost:8081/models/{model_name}")
    assert response.status_code == 200, "Describe Failed"
Loading