# coding: utf-8
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.
"""
Generic Job class extends the JobCore class with all the functionality to run the job object.
"""
from __future__ import print_function

import os
import posixpath
import shutil
import signal
import subprocess
import warnings
from datetime import datetime

import psutil

from pyiron.base.job.core import JobCore
from pyiron.base.job.executable import Executable
from pyiron.base.job.jobstatus import JobStatus
from pyiron.base.job.wrapper import JobWrapper
from pyiron.base.server.generic import Server
from pyiron.base.settings.generic import Settings

__author__ = "Joerg Neugebauer, Jan Janssen"
__copyright__ = "Copyright 2017, Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department"
__version__ = "1.0"
__maintainer__ = "Jan Janssen"
__email__ = "janssen@mpie.de"
__status__ = "production"
__date__ = "Sep 1, 2017"

s = Settings()

intercepted_signals = [signal.SIGINT, signal.SIGTERM, signal.SIGABRT]  # , signal.SIGQUIT

class GenericJob(JobCore):
"""
Generic Job class extends the JobCore class with all the functionality to run the job object. From this class
all specific Hamiltonians are derived. Therefore it should contain the properties/routines common to all jobs.
The functions in this module should be as generic as possible.
Args:
project (ProjectHDFio): ProjectHDFio instance which points to the HDF5 file the job is stored in
job_name (str): name of the job, which has to be unique within the project
Attributes:
.. attribute:: job_name
name of the job, which has to be unique within the project
.. attribute:: status
execution status of the job, can be one of the following [initialized, appended, created, submitted, running,
aborted, collect, suspended, refresh, busy, finished]
.. attribute:: job_id
unique id to identify the job in the pyiron database
.. attribute:: parent_id
job id of the predecessor job - the job which was executed before the current one in the current job series
.. attribute:: master_id
job id of the master job - a meta job which groups a series of jobs, which are executed either in parallel or in
serial.
.. attribute:: child_ids
list of child job ids - only meta jobs have child jobs - jobs which list the meta job as their master
.. attribute:: project
            Project instance the job is located in
.. attribute:: project_hdf5
ProjectHDFio instance which points to the HDF5 file the job is stored in
.. attribute:: job_info_str
            short string to describe the job by its job_name and job ID - mainly used for logging
.. attribute:: working_directory
            working directory the job is executed in - outside the HDF5 file
.. attribute:: path
path to the job as a combination of absolute file system path and path within the HDF5 file.
.. attribute:: version
Version of the hamiltonian, which is also the version of the executable unless a custom executable is used.
.. attribute:: executable
Executable used to run the job - usually the path to an external executable.
.. attribute:: library_activated
For job types which offer a Python library pyiron can use the python library instead of an external executable.
.. attribute:: server
Server object to handle the execution environment for the job.
.. attribute:: queue_id
the ID returned from the queuing system - it is most likely not the same as the job ID.
.. attribute:: logger
logger object to monitor the external execution and internal pyiron warnings.
.. attribute:: restart_file_list
list of files which are used to restart the calculation from these files.
.. attribute:: job_type
Job type object with all the available job types: ['ExampleJob', 'SerialMaster', 'ParallelMaster', 'ScriptJob',
'ListMaster']
"""
def __init__(self, project, job_name):
super(GenericJob, self).__init__(project, job_name)
self.__name__ = "GenericJob"
self.__version__ = "0.4"
self._server = Server()
self._logger = s.logger
self._executable = None
self._import_directory = None
self._status = JobStatus(db=project.db, job_id=self.job_id)
self.refresh_job_status()
self._restart_file_list = list()
self._restart_file_dict = dict()
self._process = None
for sig in intercepted_signals:
signal.signal(sig, self.signal_intercept)
    def signal_intercept(self, sig, frame):
        """
        Abort the job when one of the intercepted signals is caught.
        Args:
            sig: signal number
            frame: current stack frame - required by the signal handler interface
        """
        self._logger.info('Job {} intercepted signal {}, job is shutting down'.format(self._job_id, sig))
        self.drop_status_to_aborted()
    def drop_status_to_aborted(self):
        """
        Change the job status to aborted, unless the job already finished or was suspended.
        """
        self.refresh_job_status()
        if not (self.status.finished or self.status.suspended):
            self.status.aborted = True
@property
def version(self):
"""
Get the version of the hamiltonian, which is also the version of the executable unless a custom executable is
used.
Returns:
str: version number
"""
if self.__version__:
return self.__version__
else:
self._executable_activate()
if self._executable is not None:
return self._executable.version
else:
return None
@version.setter
def version(self, new_version):
"""
Set the version of the hamiltonian, which is also the version of the executable unless a custom executable is
used.
Args:
new_version (str): version
"""
self._executable_activate()
self._executable.version = new_version
@property
def executable(self):
"""
Get the executable used to run the job - usually the path to an external executable.
Returns:
            (str): executable path
"""
self._executable_activate()
return self._executable
@executable.setter
def executable(self, exe):
"""
Set the executable used to run the job - usually the path to an external executable.
Args:
exe (str): executable path, if no valid path is provided an executable is chosen based on version.
"""
self._executable_activate()
self._executable.executable_path = exe
@property
def server(self):
"""
Get the server object to handle the execution environment for the job.
Returns:
Server: server object
"""
return self._server
@server.setter
def server(self, server):
"""
Set the server object to handle the execution environment for the job.
Args:
server (Server): server object
"""
self._server = server
@property
def queue_id(self):
"""
Get the queue ID, the ID returned from the queuing system - it is most likely not the same as the job ID.
Returns:
int: queue ID
"""
return self.server.queue_id
@queue_id.setter
def queue_id(self, qid):
"""
Set the queue ID, the ID returned from the queuing system - it is most likely not the same as the job ID.
Args:
qid (int): queue ID
"""
self.server.queue_id = qid
@property
def logger(self):
"""
Get the logger object to monitor the external execution and internal pyiron warnings.
Returns:
logging.getLogger(): logger object
"""
return self._logger
@property
def restart_file_list(self):
"""
Get the list of files which are used to restart the calculation from these files.
Returns:
list: list of files
"""
return self._restart_file_list
@restart_file_list.setter
def restart_file_list(self, filenames):
"""
Append new files to the restart file list - the list of files which are used to restart the calculation from.
Args:
filenames (list):
"""
for f in filenames:
if not (os.path.isfile(f)):
raise IOError("File: {} does not exist".format(f))
self.restart_file_list.append(f)
@property
def restart_file_dict(self):
"""
A dictionary of the new name of the copied restart files
"""
for actual_name in [os.path.basename(f) for f in self._restart_file_list]:
if actual_name not in self._restart_file_dict.keys():
self._restart_file_dict[actual_name] = actual_name
return self._restart_file_dict
@restart_file_dict.setter
def restart_file_dict(self, val):
if not isinstance(val, dict):
raise ValueError("restart_file_dict should be a dictionary!")
else:
self._restart_file_dict = val
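    # Example sketch (paths are hypothetical): stage a restart file and rename it
    # on copy - restart_file_dict maps the file's basename to its new name in the
    # working directory (see _copy_restart_files below):
    #
    #     job.restart_file_list = ['/path/to/CHGCAR']
    #     job.restart_file_dict = {'CHGCAR': 'CHGCAR.old'}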
@property
def job_type(self):
"""
Job type object with all the available job types: ['ExampleJob', 'SerialMaster', 'ParallelMaster', 'ScriptJob',
'ListMaster']
Returns:
JobTypeChoice: Job type object
"""
return self.project.job_type
@property
def working_directory(self):
"""
        Get the working directory the job is executed in - outside the HDF5 file. The working directory equals the
path but it is represented by the filesystem:
/absolute/path/to/the/file.h5/path/inside/the/hdf5/file
becomes:
/absolute/path/to/the/file_hdf5/path/inside/the/hdf5/file
Returns:
str: absolute path to the working directory
"""
if self._import_directory:
return self._import_directory
elif not self.project_hdf5.working_directory:
self._create_working_directory()
return self.project_hdf5.working_directory
def collect_logfiles(self):
"""
Collect the log files of the external executable and store the information in the HDF5 file. This method has
to be implemented in the individual hamiltonians.
"""
pass
def write_input(self):
"""
Write the input files for the external executable. This method has to be implemented in the individual
hamiltonians.
"""
raise NotImplementedError("write procedure must be defined for derived Hamilton!")
def collect_output(self):
"""
Collect the output files of the external executable and store the information in the HDF5 file. This method has
to be implemented in the individual hamiltonians.
"""
raise NotImplementedError("read procedure must be defined for derived Hamilton!")
def append(self, job):
"""
        Metajobs like GenericMaster, ParallelMaster, SerialMaster or ListMaster allow other jobs to be appended. In the
GenericJob definition this is only a template function.
"""
raise NotImplementedError("append procedure must be defined for derived Hamilton!")
def suspend(self):
"""
        Suspend the job by storing the object and its state persistently in the HDF5 file and exiting it.
"""
self.to_hdf()
self.status.suspended = True
self._logger.info('{}, status: {}, job has been suspended'.format(self.job_info_str, self.status))
self.clear_job()
def refresh_job_status(self):
"""
Refresh job status by updating the job status with the status from the database if a job ID is available.
"""
if self.job_id:
self._status = JobStatus(initial_status=self.project.db.get_item_by_id(self.job_id)["status"],
db=self.project.db, job_id=self.job_id)
def clear_job(self):
"""
Convenience function to clear job info after suspend. Mimics deletion of all the job info after suspend in a
local test environment.
"""
        del self.__name__
        del self.__version__
        del self._executable
        del self._name
        del self._server
        del self._logger
        del self._parent_id
        del self._master_id
        del self._import_directory
        del self._restart_file_list
        del self._restart_file_dict
        del self._job_id
        del self._status
def copy(self):
"""
Copy the GenericJob object which links to the job and its HDF5 file
Returns:
GenericJob: New GenericJob object pointing to the same job
"""
if not self.project_hdf5.file_exists:
delete_file_after_copy = True
else:
delete_file_after_copy = False
self.to_hdf()
self_class = self.__class__
copied_self = self_class(job_name=self.job_name, project=self.project_hdf5.open('..'))
copied_self.from_hdf()
if delete_file_after_copy:
self.project_hdf5.remove_file()
copied_self._job_id = None
return copied_self
def copy_to(self, project=None, new_job_name=None, input_only=False, new_database_entry=True):
"""
Copy the content of the job including the HDF5 file to a new location
Args:
project (ProjectHDFio): project to copy the job to
            new_job_name (str): to duplicate the job within the same project it is necessary to modify the job name
- optional
input_only (bool): [True/False] to copy only the input - default False
new_database_entry (bool): [True/False] to create a new database entry - default True
Returns:
GenericJob: GenericJob object pointing to the new location.
"""
if project is None and new_job_name is None:
raise ValueError('copy_to requires either a new project or a new_job_name.')
if not self.project_hdf5.file_exists:
self.to_hdf()
delete_file_after_copy = True
else:
delete_file_after_copy = False
if project is None and new_job_name is not None:
new_generic_job = self.copy()
new_generic_job.reset_job_id()
if len(self.project_hdf5.h5_path.split('/')) > 2:
new_location = self.project_hdf5.open('../' + new_job_name)
else:
new_location = self.project_hdf5.__class__(self.project, new_job_name, h5_path='/' + new_job_name)
new_generic_job._name = new_job_name
new_generic_job.project_hdf5.copy_to(new_location, maintain_name=False)
new_generic_job.project_hdf5 = new_location
if new_database_entry:
new_generic_job.save()
else:
new_generic_job = super(GenericJob, self).copy_to(project, new_database_entry=new_database_entry)
new_generic_job.reset_job_id(job_id=new_generic_job.job_id)
new_generic_job.from_hdf()
if input_only:
if 'output' in new_generic_job.project_hdf5.list_groups():
del new_generic_job.project_hdf5[posixpath.join(new_generic_job.project_hdf5.h5_path, 'output')]
if delete_file_after_copy:
self.project_hdf5.remove_file()
if project is not None and new_job_name:
new_generic_job.job_name = new_job_name
return new_generic_job
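    # Example sketch (names are hypothetical): duplicate a job inside the same
    # project without registering it in the database:
    #
    #     job_copy = job.copy_to(new_job_name='job_1_copy', new_database_entry=False)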
    def copy_file_to_working_directory(self, file):
        """
        Add a file to the list of restart files, so it is copied to the working directory before the job is executed.
        Args:
            file (str): path to the file - either absolute or relative to the current working directory
        """
        self.restart_file_list.append(file)
def copy_template(self, project, new_job_name=None):
"""
Copy the content of the job including the HDF5 file but without the output data to a new location
Args:
project (ProjectHDFio): project to copy the job to
            new_job_name (str): to duplicate the job within the same project it is necessary to modify the job name
- optional
Returns:
GenericJob: GenericJob object pointing to the new location.
"""
return self.copy_to(project=project, new_job_name=new_job_name, input_only=True, new_database_entry=False)
def _kill_child(self):
if not self.server.run_mode.queue and (self.status.running or self.status.submitted):
for proc in psutil.process_iter():
try:
pinfo = proc.as_dict(attrs=['pid', 'cwd'])
except psutil.NoSuchProcess:
pass
else:
if pinfo['cwd'] is not None and pinfo['cwd'].startswith(self.working_directory):
job_process = psutil.Process(pinfo['pid'])
job_process.kill()
def remove_child(self):
"""
internal function to remove command that removes also child jobs.
Do never use this command, since it will destroy the integrity of your project.
"""
self._kill_child()
super(GenericJob, self).remove_child()
def kill(self):
if self.status.running or self.status.submitted:
master_id, parent_id = self.master_id, self.parent_id
self.remove()
self.reset_job_id()
self.master_id, self.parent_id = master_id, parent_id
else:
raise ValueError('The kill() function is only available during the execution of the job.')
def validate_ready_to_run(self):
"""
Validate that the calculation is ready to be executed. By default no generic checks are performed, but one could
check that the input information is complete or validate the consistency of the input at this point.
"""
pass
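    # A subclass would typically override validate_ready_to_run(); a hypothetical
    # sketch with an assumed input field 'n_steps':
    #
    #     def validate_ready_to_run(self):
    #         if self.input['n_steps'] is None:
    #             raise ValueError('n_steps is not set')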
def reset_job_id(self, job_id=None):
"""
Reset the job id sets the job_id to None in the GenericJob as well as all connected modules like JobStatus.
"""
if job_id is not None:
job_id = int(job_id)
self._job_id = job_id
self._status = JobStatus(db=self.project.db, job_id=self._job_id)
def run(self, run_again=False, repair=False, debug=False, run_mode=None, que_wait_for=None,):
"""
This is the main run function, depending on the job status ['initialized', 'created', 'submitted', 'running',
'collect','finished', 'refresh', 'suspended'] the corresponding run mode is chosen.
Args:
run_again (bool): Delete the existing job and run the simulation again.
repair (bool): Set the job status to created and run the simulation again.
debug (bool): Debug Mode - defines the log level of the subprocess the job is executed in.
run_mode (str): ['modal', 'non_modal', 'queue', 'manual'] overwrites self.server.run_mode
            que_wait_for (int): Queue ID to wait for before this job is executed.
"""
try:
self._logger.info('run {}, status: {}'.format(self.job_info_str, self.status))
status = self.status.string
if run_mode:
self.server.run_mode = run_mode
if run_again and self.job_id:
self._logger.info("run repair "+str(self.job_id))
status = 'initialized'
master_id, parent_id = self.master_id, self.parent_id
self.remove()
self.reset_job_id()
self.master_id, self.parent_id = master_id, parent_id
if repair and self.job_id and not self.status.finished:
status = 'created'
if status == 'initialized':
self._run_if_new(debug=debug, que_wait_for=que_wait_for)
elif status == 'created':
que_id = self._run_if_created(que_wait_for=que_wait_for)
if que_id:
self._logger.info('{}, status: {}, submitted: queue id {}'.format(self.job_info_str, self.status, que_id))
# print('job was submitted, queue id: ', que_id)
elif status == 'submitted':
self._run_if_submitted()
elif status == 'running':
self._run_if_running()
elif status == 'collect':
self._run_if_collect()
            elif status == 'suspended':
self._run_if_suspended()
elif status == 'refresh':
self._run_if_refresh()
elif status == 'busy':
self._run_if_busy()
elif status == 'finished':
self._run_if_finished(run_again=run_again)
        except (Exception, KeyboardInterrupt, SystemExit):
            self.drop_status_to_aborted()
            raise
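    # Example sketch: the two recovery flags of run() on an existing job object -
    #
    #     job.run(run_again=True)  # delete the existing job and start from scratch
    #     job.run(repair=True)     # reset an unfinished job to 'created' and run again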
def run_if_modal(self):
"""
The run if modal function is called by run to execute the simulation, while waiting for the output. For this we
use subprocess.check_output()
"""
self.run_static()
def run_static(self):
"""
The run static function is called by run to execute the simulation.
"""
self._logger.info('{}, status: {}, run job (modal)'.format(self.job_info_str, self.status))
if self.executable.executable_path == '':
self.status.aborted = True
raise ValueError('No executable set!')
self.status.running = True
self.project.db.item_update({"timestart": datetime.now()}, self.job_id)
job_crashed, out = False, None
try:
if self.server.cores == 1 or not self.executable.mpi:
out = subprocess.check_output(str(self.executable), cwd=self.project_hdf5.working_directory, shell=True,
stderr=subprocess.STDOUT, universal_newlines=True)
else:
out = subprocess.check_output([self.executable.executable_path, str(self.server.cores)],
cwd=self.project_hdf5.working_directory, shell=False,
stderr=subprocess.STDOUT, universal_newlines=True)
except subprocess.CalledProcessError as e:
if not self.server.accept_crash:
self._logger.warn("Job aborted")
self._logger.warn(e.output)
self.status.aborted = True
error_file = posixpath.join(self.project_hdf5.working_directory, "error.msg")
with open(error_file, "w") as f:
f.write(e.output)
if self.server.run_mode.non_modal:
s.close_connection()
raise RuntimeError("Job aborted")
else:
job_crashed = True
self.status.collect = True
self._logger.info('{}, status: {}, output: {}'.format(self.job_info_str, self.status, out))
self.run()
if job_crashed:
self.status.aborted = True
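    # Example sketch: a non-zero exit code of the external executable normally
    # aborts the job; with accept_crash the partial output is still collected:
    #
    #     job.server.accept_crash = True
    #     job.run()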
def run_if_interactive(self):
"""
For jobs which executables are available as Python library, those can also be executed with a library call
instead of calling an external executable. This is usually faster than a single core python job.
"""
raise NotImplementedError("This function needs to be implemented in the specific class.")
def run_if_interactive_non_modal(self):
"""
For jobs which executables are available as Python library, those can also be executed with a library call
instead of calling an external executable. This is usually faster than a single core python job.
"""
raise NotImplementedError("This function needs to be implemented in the specific class.")
def run_if_non_modal(self):
"""
The run if non modal function is called by run to execute the simulation in the background. For this we use
subprocess.Popen()
"""
shell = (os.name == 'nt')
try:
file_name = posixpath.join(self.project_hdf5.working_directory, "run_job.py")
self._logger.info("{}, status: {}, script: {}".format(self.job_info_str, self.status, file_name))
with open(posixpath.join(self.project_hdf5.working_directory, 'out.txt'), mode='w') as f_out:
with open(posixpath.join(self.project_hdf5.working_directory, 'error.txt'), mode='w') as f_err:
self._process = subprocess.Popen(['python', file_name], cwd=self.project_hdf5.working_directory,
shell=shell, stdout=f_out, stderr=f_err, universal_newlines=True)
self._logger.info("{}, status: {}, job submitted".format(self.job_info_str, self.status))
except subprocess.CalledProcessError as e:
self._logger.warn("Job aborted")
self._logger.warn(e.output)
self.status.aborted = True
raise ValueError("run_job.py crashed")
s.logger.info('submitted run %s', self.job_name)
self._logger.info('job status: %s', self.status)
def run_if_manually(self, _manually_print=True):
"""
The run if manually function is called by run if the user decides to execute the simulation manually - this
might be helpful to debug a new job type or test updated executables.
Args:
_manually_print (bool): Print explanation how to run the simulation manually - default=True.
"""
        if _manually_print:
            print('You have selected to start the job manually. '
                  'To run it, go into the working directory {} and '
                  'call \'python run_job.py\''.format(posixpath.abspath(self.project_hdf5.working_directory)))
def run_if_scheduler(self, que_wait_for=None):
"""
        The run if scheduler function is called by run if the user decides to submit the job to a queuing system. The
        job is submitted to the queuing system using subprocess.Popen()
Args:
que_wait_for (int): Job ID the current job should be waiting for before being submitted.
Returns:
int: Returns the queue ID for the job.
"""
queue_options, return_job_id = self.server.init_scheduler_run(working_dir=self.project_hdf5.working_directory,
wait_for_prev_job=que_wait_for,
job_id=self.job_id)
que_id = None
try:
self._logger.debug("SUMBIT SCHEDULED JOB: "+str(queue_options))
p = subprocess.Popen(queue_options, stdout=subprocess.PIPE, universal_newlines=True)
if return_job_id:
self.server.queue_id = p.communicate()[0]
que_id = self.server.queue_id
self._server.to_hdf(self._hdf5)
print('Queue system id: ', que_id)
except subprocess.CalledProcessError as e:
self._logger.warn("Job aborted")
self._logger.warn(e.output)
self.status.aborted = True
raise ValueError("run_queue.sh crashed")
s.logger.debug('submitted %s', self.job_name)
self._logger.debug('job status: %s', self.status)
return que_id
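    # Example sketch (queue configuration is installation specific): switch the
    # run mode to 'queue' so run() dispatches to run_if_scheduler():
    #
    #     job.server.run_mode = 'queue'
    #     job.run()
    #     print(job.server.queue_id)  # ID assigned by the queuing system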
def send_to_database(self):
"""
if the jobs should be store in the external/public database this could be implemented here, but currently it is
just a placeholder.
"""
if self.server.send_to_db:
pass
def update_master(self):
"""
        After a job is finished it checks whether it is linked to any metajob - meaning the master ID is pointing to
        this job's job ID. If this is the case and the master job is in status suspended, the child wakes up the
        master job, sets its status to refresh and executes run on the master job. If another child calls
        update_master while the master is in status refresh, the master status is set to busy; a master that is
        still busy at the end of its refresh cycle triggers another update.
"""
master_id = self.master_id
project = self.project
self._logger.info("update master: {} {}".format(master_id, self.get_job_id()))
if master_id is not None and not self.server.run_mode.modal and not self.server.run_mode.interactive:
master_db_entry = project.db.get_item_by_id(master_id)
if master_db_entry['status'] == 'suspended':
project.db.item_update({'status': 'refresh'}, master_id)
self._logger.info("run_if_refresh() called")
del self
master = project.load(master_id)
if master.server.run_mode.non_modal or master.server.run_mode.queue:
master._run_if_refresh()
if master.server.run_mode.queue and master._process:
master._process.communicate()
elif master_db_entry['status'] == 'refresh':
project.db.item_update({'status': 'busy'}, master_id)
self._logger.info("busy master: {} {}".format(master_id, self.get_job_id()))
del self
def job_file_name(self, file_name, cwd=None):
"""
combine the file name file_name with the path of the current working directory
Args:
file_name (str): name of the file
cwd (str): current working directory - this overwrites self.project_hdf5.working_directory - optional
Returns:
str: absolute path to the file in the current working directory
"""
if not cwd:
cwd = self.project_hdf5.working_directory
return posixpath.join(cwd, file_name)
def to_hdf(self, hdf=None, group_name=None):
"""
Store the GenericJob in an HDF5 file
Args:
hdf (ProjectHDFio): HDF5 group object - optional
group_name (str): HDF5 subgroup name - optional
"""
self._executable_activate_mpi()
self._type_to_hdf()
self._server.to_hdf(self._hdf5)
with self._hdf5.open('input') as hdf_input:
hdf_input["restart_file_list"] = self._restart_file_list
hdf_input["restart_file_dict"] = self._restart_file_dict
def from_hdf(self, hdf=None, group_name=None):
"""
Restore the GenericJob from an HDF5 file
Args:
hdf (ProjectHDFio): HDF5 group object - optional
group_name (str): HDF5 subgroup name - optional
"""
self._type_from_hdf()
self._server.from_hdf(self._hdf5)
with self._hdf5.open('input') as hdf_input:
if "restart_file_list" in hdf_input.list_nodes():
self._restart_file_list = hdf_input["restart_file_list"]
if "restart_file_dict" in hdf_input.list_nodes():
self._restart_file_dict = hdf_input["restart_file_dict"]
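    # Example sketch: to_hdf()/from_hdf() round trip - after save() the job can be
    # restored from the project (the job name 'job_1' is hypothetical):
    #
    #     job.save()
    #     reloaded = pr.load('job_1')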
def save(self):
"""
Save the object, by writing the content to the HDF5 file and storing an entry in the database.
Returns:
(int): Job ID stored in the database
"""
self.to_hdf()
job_id = self.project.db.add_item_dict(self.db_entry())
self._job_id = job_id
self.refresh_job_status()
return job_id
def convergence_check(self):
"""
Validate the convergence of the calculation.
Returns:
(bool): If the calculation is converged
"""
return True
def db_entry(self):
"""
Generate the initial database entry for the current GenericJob
Returns:
(dict): database dictionary {"username", "projectpath", "project", "job", "subjob", "hamversion",
"hamilton", "status", "computer", "timestart", "masterid", "parentid"}
"""
db_dict = {"username": s.login_user,
"projectpath": self.project_hdf5.root_path,
"project": self.project_hdf5.project_path,
"job": self.job_name,
"subjob": self.project_hdf5.h5_path,
"hamversion": self.version,
"hamilton": self.__name__,
"status": self.status.string,
"computer": self._db_server_entry(),
"timestart": datetime.now(),
"masterid": self.master_id,
"parentid": self.parent_id}
return db_dict
def restart(self, snapshot=-1, job_name=None, job_type=None):
"""
        Create a restart calculation from the current calculation - in the GenericJob this is the same as create_job().
A restart is only possible after the current job has finished. If you want to run the same job again with
different input parameters use job.run(run_again=True) instead.
Args:
snapshot (int): time step from which to restart the calculation - default=-1 - the last time step
job_name (str): job name of the new calculation - default=<job_name>_restart
            job_type (str): job type of the new calculation - default is the same type as the existing calculation
        Returns:
            GenericJob: the new job object for the restart calculation
"""
if not self.job_id:
self._create_job_structure(debug=False)
if job_name is None:
job_name = "{}_restart".format(self.job_name)
if job_type is None:
job_type = self.__name__
if job_type == self.__name__:
new_ham = self.copy_to(new_job_name=job_name, new_database_entry=False)
else:
new_ham = self.create_job(job_type, job_name)
new_ham.parent_id = self.job_id
# ensuring that the new job does not inherit the restart_file_list from the old job
new_ham._restart_file_list = list()
new_ham._restart_file_dict = dict()
return new_ham
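    # Example sketch (names are hypothetical): continue a finished calculation -
    # the restart job copies the input and links back via parent_id:
    #
    #     job_2 = job.restart(job_name='job_1_restart')
    #     job_2.run()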
def create_job(self, job_type, job_name):
"""
Create one of the following jobs:
- 'ExampleJob': example job just generating random number
- 'SerialMaster': series of jobs run in serial
- 'ParallelMaster': series of jobs run in parallel
- 'ScriptJob': Python script or jupyter notebook job container
- 'ListMaster': list of jobs
Args:
job_type (str): job type can be ['ExampleJob', 'SerialMaster', 'ParallelMaster', 'ScriptJob', 'ListMaster']
job_name (str): name of the job
Returns:
GenericJob: job object depending on the job_type selected
"""
return self.project.create_job(job_type=job_type, job_name=job_name)
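    # Example sketch: spawn a follow-up job of a different type from an existing
    # job object (the name 'post_processing' is hypothetical):
    #
    #     script = job.create_job('ScriptJob', 'post_processing')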
def _copy_restart_files(self):
"""
Internal helper function to copy the files required for the restart job.
"""
if not (os.path.isdir(self.working_directory)):
raise ValueError("The working directory is not yet available to copy restart files")
for i, actual_name in enumerate([os.path.basename(f) for f in self._restart_file_list]):
if actual_name in self.restart_file_dict.keys():
new_name = self.restart_file_dict[actual_name]
shutil.copy(self.restart_file_list[i], posixpath.join(self.working_directory, new_name))
else:
shutil.copy(self.restart_file_list[i], self.working_directory)
def _run_manually(self, _manually_print=True):
"""
Internal helper function to run a job manually.
Args:
_manually_print (bool): [True/False] print command for execution - default=True
"""
        if _manually_print:
            print('You have selected to start the job manually. '
                  'To run it, go into the working directory {} and '
                  'call \'python run_job.py\''.format(posixpath.abspath(self.project_hdf5.working_directory)))
def _run_if_new(self, debug=False, que_wait_for=None):
"""
Internal helper function the run if new function is called when the job status is 'initialized'. It prepares
the hdf5 file and the corresponding directory structure.
Args:
debug (bool): Debug Mode
que_wait_for (int): Que ID to wait for before this job is executed.
"""
self.validate_ready_to_run()
if self.check_if_job_exists():
print('job exists already and therefore was not created!')
else:
self._create_job_structure(debug=debug)
self.run(que_wait_for=que_wait_for)
def _run_if_created(self, que_wait_for=None):
"""
Internal helper function the run if created function is called when the job status is 'created'. It executes
the simulation, either in modal mode, meaning waiting for the simulation to finish, manually, or submits the
simulation to the que.
Args:
que_wait_for (int): Queue ID to wait for before this job is executed.
Returns:
int: Queue ID - if the job was send to the queue
"""
self.status.submitted = True
# Different run modes
if self.server.run_mode.manual:
self.run_if_manually()
elif self.server.run_mode.modal:
self.run_static()
elif self.server.run_mode.non_modal or self.server.run_mode.thread:
self.run_if_non_modal()
elif self.server.run_mode.queue:
return self.run_if_scheduler(que_wait_for)
elif self.server.run_mode.interactive:
self.run_if_interactive()
elif self.server.run_mode.interactive_non_modal:
self.run_if_interactive_non_modal()
return None
def _run_if_submitted(self): # Submitted jobs are handled by the job wrapper!
"""
Internal helper function the run if submitted function is called when the job status is 'submitted'. It means
the job is waiting in the queue. ToDo: Display a list of the users jobs in the queue.
"""
if self.server.run_mode.queue and self.project.queue_job_info(self) is None:
self.run(run_again=True)
else:
print('Job ' + str(self.job_id) + ' is waiting in the que!')
def _run_if_running(self):
"""
Internal helper function the run if running function is called when the job status is 'running'. It allows the
user to interact with the simulation while it is running.
"""
if self.server.run_mode.queue and self.project.queue_job_info(self) is None:
self.run(run_again=True)
elif self.server.run_mode.interactive:
self.run_if_interactive()
elif self.server.run_mode.interactive_non_modal:
self.run_if_interactive_non_modal()
else:
print('Job ' + str(self.job_id) + ' is running!')
def _run_if_refresh(self):