From 419e4c49d07afbbb9abd3c323ce66794410b4ed8 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 31 Jan 2018 06:38:17 +0000 Subject: [PATCH 1/7] modify some --- benchmark/cluster/vgg16/v2_pserver.yaml | 4 ++-- benchmark/cluster/vgg16/v2_trainer.yaml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml index dd1271e0cf399..857e2ff4557d4 100644 --- a/benchmark/cluster/vgg16/v2_pserver.yaml +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -23,13 +23,13 @@ spec: - name: PADDLE_JOB_NAME value: vgg16v2job - name: TRAINERS - value: "20" + value: "60" - name: PSERVERS value: "10" - name: TOPOLOGY value: "" - name: ENTRY - value: "python train.py" + value: "python -u train.py" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml index 997bbc81c99ae..be0f741b349c4 100644 --- a/benchmark/cluster/vgg16/v2_trainer.yaml +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -3,8 +3,8 @@ kind: Job metadata: name: vgg16v2job-trainer spec: - parallelism: 20 - completions: 20 + parallelism: 60 + completions: 60 template: metadata: labels: @@ -24,13 +24,13 @@ spec: - name: BATCH_SIZE value: "256" - name: TRAINERS - value: "20" + value: "60" - name: PSERVERS value: "10" - name: TOPOLOGY value: "" - name: ENTRY - value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" + value: "cd /workspace && MKL_NUM_THREADS=1 python -u /workspace/vgg16_v2.py" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT From 38b8b7f6acb51e62b97a62e3215d39b0d6f7553b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 31 Jan 2018 09:09:32 +0000 Subject: [PATCH 2/7] add results --- benchmark/cluster/README.md | 10 +++++----- benchmark/cluster/vgg16/README.md | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmark/cluster/README.md 
b/benchmark/cluster/README.md index b619613ea7a5b..c2be886b34a5a 100644 --- a/benchmark/cluster/README.md +++ b/benchmark/cluster/README.md @@ -44,14 +44,14 @@ ### Measure the Performance for Different PServer Count -- Trainer Count: 100 -- Batch Size: 64 +- Trainer Count: 60 +- Batch Size: 128 - Metrics: mini-batch / sec -| PServer Count | 10 | 20 | 40 | 60 | +| PServer Count | 3 | 6 | 10 | 20 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | - | - | - | -| PaddlePaddle v2 | - | - | - | - | +| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 | | TensorFlow | - | - | - | - | ### Measure Parallel Efficiency By Increasing Trainer Count diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index c1e85a2c40790..333e14250bb0f 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -48,14 +48,14 @@ ### different pserver number -- Trainer Count: 100 +- Trainer Count: 60 - Batch Size: 128 - Metrics: mini-batch / sec -| PServer Count | 10 | 20 | 40 | 60 | +| PServer Count | 3 | 6 |10 | 20 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | - | - | - | - | -| PaddlePaddle v2 | - | - | - | - | +| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 | | TensorFlow | - | - | - | - | From cfbbb9841d3ab9f6736cd7e02273fe8dc7a1df39 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 31 Jan 2018 09:18:35 +0000 Subject: [PATCH 3/7] clean code --- benchmark/cluster/README.md | 10 +++++----- benchmark/cluster/vgg16/v2_pserver.yaml | 2 +- benchmark/cluster/vgg16/v2_trainer.yaml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md index c2be886b34a5a..b619613ea7a5b 100644 --- a/benchmark/cluster/README.md +++ b/benchmark/cluster/README.md @@ -44,14 +44,14 @@ ### Measure the Performance for Different PServer Count -- Trainer Count: 60 -- 
Batch Size: 128 +- Trainer Count: 100 +- Batch Size: 64 - Metrics: mini-batch / sec -| PServer Count | 3 | 6 | 10 | 20 | +| PServer Count | 10 | 20 | 40 | 60 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 | -| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | | TensorFlow | - | - | - | - | ### Measure Parallel Efficiency By Increasing Trainer Count diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml index 857e2ff4557d4..935cf0be3cdb0 100644 --- a/benchmark/cluster/vgg16/v2_pserver.yaml +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -23,7 +23,7 @@ spec: - name: PADDLE_JOB_NAME value: vgg16v2job - name: TRAINERS - value: "60" + value: "20" - name: PSERVERS value: "10" - name: TOPOLOGY diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml index be0f741b349c4..5189009f3e1c5 100644 --- a/benchmark/cluster/vgg16/v2_trainer.yaml +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -3,8 +3,8 @@ kind: Job metadata: name: vgg16v2job-trainer spec: - parallelism: 60 - completions: 60 + parallelism: 20 + completions: 20 template: metadata: labels: @@ -24,7 +24,7 @@ spec: - name: BATCH_SIZE value: "256" - name: TRAINERS - value: "60" + value: "20" - name: PSERVERS value: "10" - name: TOPOLOGY From c98b40e4783a9222674c280c957837b1255c2844 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 1 Feb 2018 16:06:40 +0800 Subject: [PATCH 4/7] clean code --- benchmark/cluster/vgg16/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index 333e14250bb0f..725ce5902575b 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -55,7 +55,7 @@ | PServer Count | 3 | 6 |10 | 20 | | -- | -- | -- | -- | -- | | PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 | -| PaddlePaddle v2 
| 412.2 | 368.4 | 346.8 | 283.2 | +| PaddlePaddle v2 | - | - | 729.7 | - | | TensorFlow | - | - | - | - | From 5530212defd0afd81e202f9e90a499823daf797f Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 1 Feb 2018 16:33:03 +0800 Subject: [PATCH 5/7] add others --- benchmark/cluster/vgg16/README.md | 4 +++- benchmark/cluster/vgg16/v2_pserver.yaml | 2 +- benchmark/cluster/vgg16/v2_trainer.yaml | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index 725ce5902575b..b0bdc0288f6dc 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -48,6 +48,8 @@ ### different pserver number +*The performance gap between Fuild and v2 comes from the network interference.* + - Trainer Count: 60 - Batch Size: 128 - Metrics: mini-batch / sec @@ -55,7 +57,7 @@ | PServer Count | 3 | 6 |10 | 20 | | -- | -- | -- | -- | -- | | PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 | -| PaddlePaddle v2 | - | - | 729.7 | - | +| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | | TensorFlow | - | - | - | - | diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml index 935cf0be3cdb0..dd1271e0cf399 100644 --- a/benchmark/cluster/vgg16/v2_pserver.yaml +++ b/benchmark/cluster/vgg16/v2_pserver.yaml @@ -29,7 +29,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "python -u train.py" + value: "python train.py" - name: TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml index 5189009f3e1c5..997bbc81c99ae 100644 --- a/benchmark/cluster/vgg16/v2_trainer.yaml +++ b/benchmark/cluster/vgg16/v2_trainer.yaml @@ -30,7 +30,7 @@ spec: - name: TOPOLOGY value: "" - name: ENTRY - value: "cd /workspace && MKL_NUM_THREADS=1 python -u /workspace/vgg16_v2.py" + value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py" - name: 
TRAINER_PACKAGE value: "/workspace" - name: PADDLE_INIT_PORT From ccef94a376aed4bc8576597f05fc8b00e37ab999 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 1 Feb 2018 16:42:39 +0800 Subject: [PATCH 6/7] add comments --- benchmark/cluster/vgg16/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index b0bdc0288f6dc..0d525e952258a 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -48,18 +48,18 @@ ### different pserver number -*The performance gap between Fuild and v2 comes from the network interference.* - - Trainer Count: 60 - Batch Size: 128 - Metrics: mini-batch / sec | PServer Count | 3 | 6 |10 | 20 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle Fluid(should fixed in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | | PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | | TensorFlow | - | - | - | - | +*The performance gap between Fluid and v2 comes from the network interference.* + ## Steps to run the performance test From 00b9aed0060acd983dce1d3cd1db8a859ec21219 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 1 Feb 2018 16:54:53 +0800 Subject: [PATCH 7/7] fix typo --- benchmark/cluster/vgg16/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md index 0d525e952258a..27eb265ce4698 100644 --- a/benchmark/cluster/vgg16/README.md +++ b/benchmark/cluster/vgg16/README.md @@ -54,7 +54,7 @@ | PServer Count | 3 | 6 |10 | 20 | | -- | -- | -- | -- | -- | -| PaddlePaddle Fluid(should fixed in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | +| PaddlePaddle Fluid (should be fixed in next PR) | 589.1 | 592.6 | 656.4 | 655.8 | | PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 | | TensorFlow | - | - | - | - |