Skip to content

compute canada

Julien Cohen-Adad edited this page Sep 30, 2020 · 12 revisions

Load environment

# load the Python module
module load python/3.7
# create virtual environment
virtualenv NAME_VENV
# activate it
source NAME_VENV/bin/activate
# deactivate it when you are done
deactivate

Logbook

This page is a logbook of my experiments running jobs on Compute Canada, with the goal of finding the best configuration for allocating resources.

data-single-subject

experiment 1

#SBATCH --time=0-02:00        # time (DD-HH:MM)
#SBATCH --ntasks=20           # number of MPI processes
#SBATCH --mem-per-cpu=8192    # memory; default unit is megabytes

   Account      User        JobID               Start                 End  AllocCPUS    Elapsed                      AllocTRES    CPUTime     AveRSS     MaxRSS MaxRSSTask MaxRSSNode        NodeList ExitCode                State 
---------- --------- ------------ ------------------- ------------------- ---------- ---------- ------------------------------ ---------- ---------- ---------- ---------- ---------- --------------- -------- -------------------- 
def-jcohe+    jcohen 46115600     2020-07-14T11:39:54 2020-07-14T12:38:24         20   00:58:30 billing=40,cpu=20,mem=160G,no+   19:30:00                                                cdr[669,699]      0:0            COMPLETED 
def-jcohe+           46115600.ba+ 2020-07-14T11:39:54 2020-07-14T12:38:24         15   00:58:30         cpu=15,mem=120G,node=1   14:37:30  10773879K  10773879K          0     cdr669          cdr669      0:0            COMPLETED 
def-jcohe+           46115600.ex+ 2020-07-14T11:39:54 2020-07-14T12:38:24         20   00:58:30 billing=40,cpu=20,mem=160G,no+   19:30:00    247.50K       285K          1     cdr699    cdr[669,699]      0:0            COMPLETED 

experiment 2

#SBATCH --time=0-01:00        # time (DD-HH:MM)
#SBATCH --ntasks=20           # number of MPI processes
#SBATCH --mem-per-cpu=16384   # memory; default unit is megabytes

   Account      User        JobID               Start                 End  AllocCPUS    Elapsed                      AllocTRES    CPUTime     AveRSS     MaxRSS MaxRSSTask MaxRSSNode        NodeList ExitCode                State 
---------- --------- ------------ ------------------- ------------------- ---------- ---------- ------------------------------ ---------- ---------- ---------- ---------- ---------- --------------- -------- -------------------- 
def-jcohe+    jcohen 45670476     2020-07-08T18:18:12 2020-07-08T18:48:34         20   00:30:22 billing=80,cpu=20,mem=320G,no+   10:07:20                                                      cdr471      0:0            COMPLETED 
def-jcohe+           45670476.ba+ 2020-07-08T18:18:12 2020-07-08T18:48:34         20   00:30:22         cpu=20,mem=320G,node=1   10:07:20  11860564K  11860564K          0     cdr471          cdr471      0:0            COMPLETED 
def-jcohe+           45670476.ex+ 2020-07-08T18:18:12 2020-07-08T18:48:35         20   00:30:23 billing=80,cpu=20,mem=320G,no+   10:07:40        96K        96K          0     cdr471          cdr471      0:0            COMPLETED 

data-multi-subject

multi-cedar-ntasks250-mempercpu16384

#SBATCH --time=0-05:00        # time (DD-HH:MM)
#SBATCH --ntasks=250          # number of MPI processes
#SBATCH --mem-per-cpu=16384   # memory; default unit is megabytes

   Account      User        JobID               Start                 End  AllocCPUS    Elapsed                      AllocTRES    CPUTime     AveRSS     MaxRSS MaxRSSTask MaxRSSNode        NodeList ExitCode                State 
---------- --------- ------------ ------------------- ------------------- ---------- ---------- ------------------------------ ---------- ---------- ---------- ---------- ---------- --------------- -------- -------------------- 
def-jcohe+    jcohen 46139273     2020-07-14T21:03:27 2020-07-15T01:04:16        250   04:00:49 billing=1000,cpu=250,mem=4000+ 41-19:24:10                                             cdr[1764-1765,+    0:125        OUT_OF_MEMORY 
def-jcohe+           46139273.ba+ 2020-07-14T21:03:27 2020-07-15T01:04:16          1   04:00:49           cpu=1,mem=16G,node=1   04:00:49  16578593K  16578593K          0    cdr1764         cdr1764    0:125        OUT_OF_MEMORY 
def-jcohe+           46139273.ex+ 2020-07-14T21:03:27 2020-07-15T01:04:19        250   04:00:52 billing=1000,cpu=250,mem=4000+ 41-19:36:40     416329       575K          2    cdr1767 cdr[1764-1765,+      0:0            COMPLETED 

multi-cedar-ntasks40-mempercpu16384

#SBATCH --time=0-09:00        # time (DD-HH:MM)
#SBATCH --ntasks=40           # number of MPI processes
#SBATCH --mem-per-cpu=16384   # memory; default unit is megabytes

   Account      User        JobID               Start                 End  AllocCPUS    Elapsed                      AllocTRES    CPUTime     AveRSS     MaxRSS MaxRSSTask MaxRSSNode        NodeList ExitCode                State 
---------- --------- ------------ ------------------- ------------------- ---------- ---------- ------------------------------ ---------- ---------- ---------- ---------- ---------- --------------- -------- -------------------- 
def-jcohe+    jcohen 46189141     2020-07-15T18:09:53 2020-07-15T20:46:44         40   02:36:51 billing=160,cpu=40,mem=640G,n+ 4-08:34:00                                             cdr[783,787,81+    0:125        OUT_OF_MEMORY 
def-jcohe+           46189141.ba+ 2020-07-15T18:09:53 2020-07-15T20:46:44          3   02:36:51           cpu=3,mem=48G,node=1   07:50:33  41430699K  41430699K          0     cdr783          cdr783    0:125        OUT_OF_MEMORY 
def-jcohe+           46189141.ex+ 2020-07-15T18:09:53 2020-07-15T20:47:03         40   02:37:10 billing=160,cpu=40,mem=640G,n+ 4-08:46:40     223641       502K          3     cdr813 cdr[783,787,81+      0:0            COMPLETED 

Log from one subject:
OS: linux (Linux-3.10.0-1062.12.1.el7.x86_64-x86_64-with-centos-7.7.1908-Core)
CPU cores: Available: 32, Used by SCT: 1
RAM: MemTotal:       528295492 kB
total        used        free      shared  buff/cache   available
Mem:         515913       80352      411533        2416       24027      430505
Swap:             0           0           0

multi-niagara-nodes4-cpus80

#SBATCH --time=0-09:00        # time (DD-HH:MM)
#SBATCH --nodes=4
#SBATCH --cpus-per-task=80   # number of OpenMP processes

       JobID      User         Account AllocNodes               Start                 End    Elapsed                      AllocTRES    CPUTime        NodeList ExitCode       State 
------------ --------- --------------- ---------- ------------------- ------------------- ---------- ------------------------------ ---------- --------------- -------- ----------- 
     3713484    jcohen      def-jcohen          4 2020-07-16T12:07:14 2020-07-16T15:12:10   03:04:56     billing=160,cpu=320,node=4 41-02:18:40 nia[0040,0060,+      0:0   COMPLETED 
3713484.bat+                def-jcohen          1 2020-07-16T12:07:14 2020-07-16T15:12:10   03:04:56            cpu=80,mem=0,node=1 10-06:34:40         nia0040      0:0   COMPLETED 
3713484.ext+                def-jcohen          4 2020-07-16T12:07:14 2020-07-16T15:12:10   03:04:56     billing=160,cpu=320,node=4 41-02:18:40 nia[0040,0060,+      0:0   COMPLETED 

Log from one subject:
OS: linux (Linux-3.10.0-957.27.2.el7.x86_64-x86_64-with-centos-7.6.1810-Core)
CPU cores: Available: 80, Used by SCT: 1
RAM: MemTotal:       197698400 kB
total        used        free      shared  buff/cache   available
Mem:         193064       37345      125875       29052       29844      125520
Swap:             0           0           0

Observed in a few subjects:
MemoryError: Unable to allocate 600. MiB for an array with shape (384, 640, 320) and data type float64
Proof that parallelization worked (80 subjects processed simultaneously):
Started at 12h07m40s: sub-nottwil06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-nottwil06.log
Started at 12h07m40s: sub-perform06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-perform06.log
Started at 12h07m40s: sub-cmrra02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrra02.log
Started at 12h07m40s: sub-amu04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-amu04.log
Started at 12h07m40s: sub-oxfordFmrib04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-oxfordFmrib04.log
Started at 12h07m40s: sub-perform01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-perform01.log
Started at 12h07m40s: sub-tokyoIngenia06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyoIngenia06.log
Started at 12h07m40s: sub-mgh03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mgh03.log
Started at 12h07m40s: sub-queensland06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-queensland06.log
Started at 12h07m40s: sub-tokyoIngenia02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyoIngenia02.log
Started at 12h07m40s: sub-tokyoIngenia01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyoIngenia01.log
Started at 12h07m40s: sub-sapienza05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-sapienza05.log
Started at 12h07m40s: sub-unf07. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-unf07.log
Started at 12h07m40s: sub-brnoUhb03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoUhb03.log
Started at 12h07m40s: sub-cmrrb06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrrb06.log
Started at 12h07m40s: sub-oxfordFmrib06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-oxfordFmrib06.log
Started at 12h07m40s: sub-tokyoSkyra03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyoSkyra03.log
Started at 12h07m40s: sub-brnoUhb01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoUhb01.log
Started at 12h07m40s: sub-cmrrb02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrrb02.log
Started at 12h07m40s: sub-balgrist05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-balgrist05.log
Started at 12h07m40s: sub-unf05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-unf05.log
Started at 12h07m40s: sub-ucl03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-ucl03.log
Started at 12h07m40s: sub-ucl05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-ucl05.log
Started at 12h07m40s: sub-vallHebron07. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-vallHebron07.log
Started at 12h07m40s: sub-dresden01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-dresden01.log
Started at 12h07m40s: sub-vuiisIngenia01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-vuiisIngenia01.log
Started at 12h07m40s: sub-brnoCeitec01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoCeitec01.log
Started at 12h07m40s: sub-hamburg05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-hamburg05.log
Started at 12h07m40s: sub-brnoCeitec03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoCeitec03.log
Started at 12h07m40s: sub-strasbourg01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-strasbourg01.log
Started at 12h07m40s: sub-sherbrooke03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-sherbrooke03.log
Started at 12h07m40s: sub-hamburg06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-hamburg06.log
Started at 12h07m40s: sub-oxfordOhba01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-oxfordOhba01.log
Started at 12h07m40s: sub-mniS09. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mniS09.log
Started at 12h07m40s: sub-hamburg02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-hamburg02.log
Started at 12h07m40s: sub-tokyoSkyra06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyoSkyra06.log
Started at 12h07m40s: sub-geneva03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-geneva03.log
Started at 12h07m40s: sub-mniS05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mniS05.log
Started at 12h07m40s: sub-nottwil04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-nottwil04.log
Started at 12h07m40s: sub-cmrrb04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrrb04.log
Started at 12h07m40s: sub-ubc03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-ubc03.log
Started at 12h07m40s: sub-brnoCeitec04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoCeitec04.log
Started at 12h07m40s: sub-ucl06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-ucl06.log
Started at 12h07m40s: sub-beijingVerio03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-beijingVerio03.log
Started at 12h07m40s: sub-vuiisAchieva06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-vuiisAchieva06.log
Started at 12h07m40s: sub-tokyo750w07. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyo750w07.log
Started at 12h07m40s: sub-brnoUhb06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoUhb06.log
Started at 12h07m40s: sub-vallHebron02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-vallHebron02.log
Started at 12h07m40s: sub-mpicbs07. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mpicbs07.log
Started at 12h07m40s: sub-vuiisIngenia06. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-vuiisIngenia06.log
Started at 12h07m40s: sub-mgh04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mgh04.log
Started at 12h07m40s: sub-ubc04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-ubc04.log
Started at 12h07m40s: sub-tokyo750w02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyo750w02.log
Started at 12h07m40s: sub-beijingPrisma01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-beijingPrisma01.log
Started at 12h07m40s: sub-mountSinai03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mountSinai03.log
Started at 12h07m40s: sub-cmrra04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrra04.log
Started at 12h07m40s: sub-cmrrb07. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrrb07.log
Started at 12h07m40s: sub-oxfordOhba05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-oxfordOhba05.log
Started at 12h07m40s: sub-nwu03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-nwu03.log
Started at 12h07m40s: sub-cmrrb05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-cmrrb05.log
Started at 12h07m40s: sub-tokyoIngenia05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyoIngenia05.log
Started at 12h07m40s: sub-queensland03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-queensland03.log
Started at 12h07m40s: sub-strasbourg05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-strasbourg05.log
Started at 12h07m40s: sub-nwu05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-nwu05.log
Started at 12h07m40s: sub-queensland04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-queensland04.log
Started at 12h07m40s: sub-mniS07. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mniS07.log
Started at 12h07m40s: sub-tokyo750w01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyo750w01.log
Started at 12h07m40s: sub-vallHebron05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-vallHebron05.log
Started at 12h07m40s: sub-amu01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-amu01.log
Started at 12h07m40s: sub-barcelona03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-barcelona03.log
Started at 12h07m40s: sub-beijingGE04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-beijingGE04.log
Started at 12h07m40s: sub-mgh05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mgh05.log
Started at 12h07m40s: sub-strasbourg04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-strasbourg04.log
Started at 12h07m40s: sub-tokyo750w04. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tokyo750w04.log
Started at 12h07m40s: sub-mpicbs05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mpicbs05.log
Started at 12h07m40s: sub-unf02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-unf02.log
Started at 12h07m40s: sub-mniS01. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-mniS01.log
Started at 12h07m40s: sub-brnoCeitec02. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoCeitec02.log
Started at 12h07m40s: sub-brnoUhb05. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-brnoUhb05.log
Started at 12h07m40s: sub-tehranS03. See log file /scratch/j/jcohen/jcohen/results-multi/log/process_data_sub-tehranS03.log

multi-niagara-nodes8-cpus80

#SBATCH --time=0-09:00        # time (DD-HH:MM)
#SBATCH --nodes=8
#SBATCH --cpus-per-task=80   # number of OpenMP processes

MemoryError: Unable to allocate 1.30 GiB for an array with shape (58232983, 3) and data type int64
--> job canceled by user

multi-niagara-nodes4-cpus20

#SBATCH --time=0-09:00        # time (DD-HH:MM)
#SBATCH --nodes=4
#SBATCH --cpus-per-task=20    # number of OpenMP processes

MemoryError: 
--> job canceled by user