#!/bin/bash
#!
#! Based on:
#! Example SLURM job script for Wilkes2 (Broadwell, ConnectX-4, P100)
#! Last updated: Mon 13 Nov 12:06:57 GMT 2017
#! Further updated using new Ampere script:
#! Example SLURM job script for Wilkes3 (AMD EPYC 7763, ConnectX-6, A100)
#! Last updated: Fri 30 Jul 11:07:58 BST 2021
#!
#!#############################################################
#!#### Modify the options in this section as appropriate ######
#!#############################################################
#! sbatch directives begin here ###############################
#! Name of the job:
#SBATCH -J x50_FASHION
#! Which project should be charged (NB Wilkes2 projects end in '-GPU'):
#SBATCH -A T2-CS117-GPU
#! How many whole nodes should be allocated?
#SBATCH --nodes=1
#! How many (MPI) tasks will there be in total?
#! Note: this probably should not exceed the total number of GPUs in use.
#SBATCH --ntasks=1
#! Specify the number of GPUs per node (between 1 and 4; must be 4 if nodes>1).
#! Note that the job submission script will enforce no more than 32 cpus per GPU.
#SBATCH --gres=gpu:1
#! What types of email messages do you wish to receive?
#SBATCH --mail-type=FAIL
#! Uncomment this to prevent the job from being requeued (e.g. if
#! interrupted by node failure or system downtime):
##SBATCH --no-requeue
#! Do not change:
#SBATCH -p ampere
#SBATCH -t 35:59:30
#SBATCH --cpus-per-task=32
#! sbatch directives end here (put any additional directives above this line)
export TUNE_MAX_PENDING_TRIALS_PG=32
export XLA_PYTHON_CLIENT_PREALLOCATE=false
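#! Note: TUNE_MAX_PENDING_TRIALS_PG caps the number of pending Ray Tune
#! trials when placement groups are used, and XLA_PYTHON_CLIENT_PREALLOCATE=false
#! stops JAX preallocating most of the GPU memory at startup, letting the
#! parallel train.py processes launched via xargs below share the single GPU.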
#! Notes:
#! Charging is determined by GPU number*walltime.
#! Number of nodes and tasks per node allocated by SLURM (do not change):
numnodes=$SLURM_JOB_NUM_NODES
numtasks=$SLURM_NTASKS
mpi_tasks_per_node=$(echo "$SLURM_TASKS_PER_NODE" | sed -e 's/^\([0-9][0-9]*\).*$/\1/')
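#! e.g. SLURM_TASKS_PER_NODE="2(x3)" (2 tasks on each of 3 nodes) gives
#! mpi_tasks_per_node=2; the sed keeps only the leading digits.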
#! ############################################################
#! Modify the settings below to specify the application's environment, location
#! and launch method:
#! Optionally modify the environment seen by the application
#! (note that SLURM reproduces the environment at submission irrespective of ~/.bashrc):
. /etc/profile.d/modules.sh # Leave this line (enables the module command)
module purge # Removes all modules still loaded
module load rhel8/default-amp # REQUIRED - loads the basic environment
module load python-3.9.6-gcc-5.4.0-sbr552h
module load cuda/11.1
module load cudnn/8.0_cuda-11.1
#! Insert additional module load commands after this line if needed:
#! dataset and optimiser are passed through from the ray_runner.sh script
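#! For illustration only: a hypothetical submission from ray_runner.sh
#! (the exact flags and variable names used there are assumptions):
#!   sbatch --export=ALL,dataset=fashion_mnist,optimiser=Adam,num_parallel_calls=4 x50.slurm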
group_name="${dataset}_${optimiser}"
write_path="/rds/user/authorid/hpc-work/LearningSecondOrderOptimiser/runs/${group_name}"
mkdir -p "$write_path" # Ensure the run directory exists before machine.file is moved there below
#! Command prefix: activate the project virtualenv before the run options below
#! (the original template placed the full path to an application executable here):
application="source .venv_3.9/bin/activate && "
declare -a seeds=("2119213981" "1608860012" "1021032354" "280853612" "1415121920" "503407898" "995043888" "333388907" "1971069637" "1335198443" "285161167" "894408494" "952170761" "704127742" "168220153" "48936849" "1822305184" "1550130155" "812730049" "833357148" "1043290698" "369867697" "1119789429" "495194068" "806185573" "980810461" "1323666201" "1112576223" "33383858" "735190115" "2114747825" "153301904" "1417633242" "572670284" "71283607" "545220238" "1708331336" "31319830" "795335164" "698059710" "1298677938" "1248108292" "129243081" "869963795" "1378116027" "73798405" "1729011228" "1539271366" "999822958" "1251819451")
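#! 50 fixed seeds (presumably the 'x50' in the script name): each seed
#! becomes one train.py invocation, fanned out by xargs below with at most
#! ${num_parallel_calls} running concurrently.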
#! num_parallel_calls is likewise passed through from ray_runner.sh (the
#! self-assignment below is a no-op, kept to document the expected variable):
num_parallel_calls=${num_parallel_calls}
#! Run options for the application:
#options="train.py -c configs/${dataset}.yaml configs/optimal_ASHA_${dataset}/${optimiser}.yaml -g optimal_${dataset}_${optimiser}_batch --seed ${seed}"
options="echo ${seeds[@]} | xargs -n 1 -P ${num_parallel_calls} python train.py -c configs/${dataset}.yaml configs/optimal_ASHA_${dataset}/${optimiser}.yaml -g optimal_${dataset}_${optimiser}_batch -n ${optimiser} --seed"
#options="python train.py -c configs/fashion_mnist.yaml configs/Adam.yaml -g test_fashion_adam_jax -n debug"
#! Work directory (i.e. where the job will run):
workdir="$SLURM_SUBMIT_DIR" # SLURM_SUBMIT_DIR is the directory in which sbatch was run.
#! Are you using OpenMP (NB this is unrelated to OpenMPI)? If so, increase
#! this safe value of 1 to no more than 128:
export OMP_NUM_THREADS=1
#! Number of MPI tasks to be started by the application per node and in total (do not change):
np=$(( numnodes * mpi_tasks_per_node ))
#! Choose this for a pure shared-memory OpenMP parallel program on a single node:
#! (OMP_NUM_THREADS threads will be created):
CMD="$application $options"
#! Choose this for a MPI code using OpenMPI:
#CMD="mpirun -npernode $mpi_tasks_per_node -np $np $application $options"
###############################################################
### You should not have to change anything below this line ####
###############################################################
cd "$workdir"
echo -e "Changed directory to `pwd`.\n"
JOBID=$SLURM_JOB_ID
echo -e "JobID: $JOBID\n======"
echo "Time: `date`"
echo "Running on master node: `hostname`"
echo "Current directory: `pwd`"
if [ "$SLURM_JOB_NODELIST" ]; then
#! Create a machine file:
export NODEFILE=`generate_pbs_nodefile`
cat $NODEFILE | uniq > machine.file.$JOBID
echo -e "\nNodes allocated:\n================"
echo `cat machine.file.$JOBID | sed -e 's/\..*$//g'`
mv machine.file.$JOBID $write_path/machine.file.$JOBID
fi
echo -e "\nnumtasks=$numtasks, numnodes=$numnodes, mpi_tasks_per_node=$mpi_tasks_per_node (OMP_NUM_THREADS=$OMP_NUM_THREADS)"
echo -e "\nExecuting command:\n==================\n$CMD\n"
eval $CMD