Skip to content

Commit

Permalink
Distributed launch.py kill procs at exit (#9019)
Browse files Browse the repository at this point in the history
Patch for distributed launcher script

---------

Co-authored-by: kgajdamo <kinga.gajdamowicz@intel.com>
  • Loading branch information
JakubPietrakIntel and kgajdamo committed Mar 5, 2024
1 parent d491f43 commit db9d8f5
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 26 deletions.
9 changes: 2 additions & 7 deletions examples/distributed/pyg/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ def submit_all_jobs(args, udf_command, dry_run=False):
if len(args.extra_envs) > 0 else cmd)

cmd = cmd[:-1]
cmd += " --logging"
cmd += f" --dataset_root_dir={args.dataset_root_dir}"
cmd += f" --dataset={args.dataset}"
cmd += f" --num_nodes={args.num_nodes}"
Expand All @@ -259,7 +260,7 @@ def submit_all_jobs(args, udf_command, dry_run=False):
cmd += f" --batch_size={args.batch_size}"
cmd += f" --num_workers={args.num_workers}"
cmd += f" --concurrency={args.concurrency}"
cmd += f" --logging --progress_bar --ddp_port={args.ddp_port})"
cmd += f" --ddp_port={args.ddp_port})"
servers_cmd.append(cmd)

if not dry_run:
Expand Down Expand Up @@ -399,12 +400,6 @@ def main():
default=11111,
help="Port used for PyTorch's DDP communication",
)
parser.add_argument(
"--part_config",
type=str,
required=True,
help="File (in workspace) of the partition configuration",
)
parser.add_argument(
"--ip_config",
required=True,
Expand Down
43 changes: 24 additions & 19 deletions examples/distributed/pyg/run_dist.sh
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
#!/bin/bash

CONDA_ENV=/home/XXX/anaconda3/envs/pyg
PYG_WORKSPACE=$PWD
PY_EXEC=${CONDA_ENV}/bin/python
EXEC_SCRIPT=${PYG_WORKSPACE}/node_ogb_cpu.py
USER=user
CONDA_ENV=pygenv
CONDA_DIR="/home/${USER}/anaconda3"
PY_EXEC="${CONDA_DIR}/envs/${CONDA_ENV}/bin/python"
EXEC_SCRIPT="${PYG_WORKSPACE}/node_ogb_cpu.py"
CMD="cd ${PYG_WORKSPACE}; ${PY_EXEC} ${EXEC_SCRIPT}"

# Node number:
NUM_NODES=2

# Dataset folder:
DATASET_ROOT_DIR="/home/XXX/mag/2-parts"

# Dataset name:
DATASET=ogbn-mag
DATASET=ogbn-products

# Dataset folder:
DATASET_ROOT_DIR="../../../data/partitions/${DATASET}/${NUM_NODES}-parts"

# Number of epochs:
NUM_EPOCHS=3
NUM_EPOCHS=10

# The batch size:
BATCH_SIZE=1024

# Fanout per layer:
NUM_NEIGHBORS="5,5,5"

# Number of workers for sampling:
NUM_WORKERS=2
CONCURRENCY=2

# Partition data directory:
PART_CONFIG="/home/XXX/mag/2-parts/ogbn-products-partitions/META.json"
NUM_PARTS=2
CONCURRENCY=4

DDP_PORT=12351

# Fanout per layer:
NUM_NEIGHBORS="15,10,5"
# DDP Port
DDP_PORT=11111

# IP configuration path:
IP_CONFIG=${PYG_WORKSPACE}/ip_config.yaml
Expand All @@ -40,9 +40,14 @@ IP_CONFIG=${PYG_WORKSPACE}/ip_config.yaml
logdir="logs"
mkdir -p "logs"
logname=log_${DATASET}_${NUM_PARTS}_$RANDOM
echo $logname
echo "stdout stored in ${PYG_WORKSPACE}/${logdir}/${logname}"
set -x

# stdout and stderr are also written to `${logdir}/${logname}` via tee.
python launch.py --workspace "${PYG_WORKSPACE}" --num_nodes ${NUM_NODES} --num_neighbors ${NUM_NEIGHBORS} --dataset_root_dir ${DATASET_ROOT_DIR} --dataset ${DATASET} --num_epochs ${NUM_EPOCHS} --batch_size ${BATCH_SIZE} --num_workers ${NUM_WORKERS} --concurrency ${CONCURRENCY} --ddp_port ${DDP_PORT} --part_config ${PART_CONFIG} --ip_config "${IP_CONFIG}" "cd /home/XXX; source ${CONDA_ENV}/bin/activate; cd ${PYG_WORKSPACE}; ${PY_EXEC} ${EXEC_SCRIPT}" |& tee ${logdir}/${logname}.txt
python launch.py --workspace ${PYG_WORKSPACE} --ip_config ${IP_CONFIG} --ssh_username ${USER} --num_nodes ${NUM_NODES} --num_neighbors ${NUM_NEIGHBORS} --dataset_root_dir ${DATASET_ROOT_DIR} --dataset ${DATASET} --num_epochs ${NUM_EPOCHS} --batch_size ${BATCH_SIZE} --num_workers ${NUM_WORKERS} --concurrency ${CONCURRENCY} --ddp_port ${DDP_PORT} "${CMD}" |& tee ${logdir}/${logname} &
pid=$!
echo "started launch.py: ${pid}"
# On Ctrl + C (SIGINT), send SIGINT to the background job started above
trap "kill -2 $pid" SIGINT
wait $pid
set +x

0 comments on commit db9d8f5

Please sign in to comment.