In [1]:
# Importing modules from the pyHTC package

from pyHTC.Study import *
import pyHTC.toolkit as toolkit

# Working directory preparation
---

In [2]:
# Preparation of the working directory

toolkit.prepare_work_dir()

# Study Creation
---

In [3]:
# Study creation 
# This is basically an instantiation of the class 'Study'

myStudy = StudyObj(name='times2')  # All attributes are filled by default, i'm just giving it a name

In [4]:
# To see ALL the attributes, use the 'describe' method

myStudy.describe()  # note the fancy alphabetical order ;) 

{'arguments': '$(input_file)',
 'error_dir': 'error/',
 'executable': 'exe.sh',
 'input_dir': '',
 'job_flavour': 'espresso',
 'log_dir': 'log/',
 'name': 'times2',
 'output_dir': 'output/',
 'path': '/afs/cern.ch/user/a/apoyet/public/pyHTC/the_simplest_example',
 'queue': '',
 'submit_file': 'muSubmit.sub',
 'universe': 'vanilla'}


In [5]:
# Or if you like, you can get those attributes in a df

my_attributes_df = myStudy.describe(in_df=True)
my_attributes_df

Unnamed: 0,0
name,times2
path,/afs/cern.ch/user/a/apoyet/public/pyHTC/the_si...
executable,exe.sh
submit_file,muSubmit.sub
input_dir,
arguments,$(input_file)
output_dir,output/
error_dir,error/
log_dir,log/
job_flavour,espresso


# Input Creation and Study Definition
---

In [6]:
# A simple DataFrame

col = ['number']
data = np.random.randn(10)
df = pd.DataFrame(data, columns=col)
index = [myStudy.name+'_'+str(i) for i in df['number'].values]
df.index = index
df.head()

Unnamed: 0,number
times2_-1.1534706811380708,-1.153471
times2_-1.260748318879834,-1.260748
times2_-2.0828844876255115,-2.082884
times2_1.4339443056519938,1.433944
times2_-0.19536975431814083,-0.19537


In [7]:
# Let's define the study as a series of jobs

myStudy.define_study(df)

Unnamed: 0,number
times2_-1.1534706811380708,-1.153471
times2_-1.260748318879834,-1.260748
times2_-2.0828844876255115,-2.082884
times2_1.4339443056519938,1.433944
times2_-0.19536975431814083,-0.19537
times2_0.8538564724080824,0.853856
times2_-1.2972846991297584,-1.297285
times2_-0.3856101256838024,-0.38561
times2_-0.865281013859263,-0.865281
times2_0.1587961641614595,0.158796


In [8]:
# This df is now an attribute of the study called DF

myStudy.DF

Unnamed: 0,number
times2_-1.1534706811380708,-1.153471
times2_-1.260748318879834,-1.260748
times2_-2.0828844876255115,-2.082884
times2_1.4339443056519938,1.433944
times2_-0.19536975431814083,-0.19537
times2_0.8538564724080824,0.853856
times2_-1.2972846991297584,-1.297285
times2_-0.3856101256838024,-0.38561
times2_-0.865281013859263,-0.865281
times2_0.1587961641614595,0.158796


In [19]:
# The idea is to submit a function that will take as argument this df and multiply by two the 'number' column
# Such a function is written in a python script called times2.py in the working directory
# We therefore store the DF as pickle in the input folder
# WARNING: correct the link in the script ;) 

myStudy.DF.to_pickle(os.getcwd()+'/input/input_0.pkl')

In [20]:
# In this case there's only one python script to run... 
# Even if it doesn't make so much sense, we need to respect the nomenclature of the interface
# Our python script is therefore copied into the input folder under the name studyname_0.in

!cp times2.py input/times2_0.in

# Submission
---

In [30]:
# One needs a submit file to do so 

myStudy.submit2file(myStudy.submit2str())

In [31]:
# One can display the submission file

myStudy.display_subfile()

executable = exe.sh
arguments = $(input_file)
output = output/times2.$(ClusterId).$(ProcId).out
error = error/times2.$(ClusterId).$(ProcId).err
log = log/times2.$(ClusterId).log
universe = vanilla
+JobFlavour = "espresso"
queue input_file matching files /afs/cern.ch/user/a/apoyet/public/pyHTC/the_simplest_example/input/times2_*.in


In [32]:
# And...... SUBMISSION

myStudy.submit2HTCondor()

Submitting job(s).
1 job(s) submitted to cluster 3694560.



In [24]:
# Monitor

myStudy.condor_q()



-- Schedd: bigbird16.cern.ch : <188.184.90.62:9618?... @ 08/08/19 15:26:18
OWNER  BATCH_NAME     SUBMITTED   DONE   RUN    IDLE  TOTAL JOB_IDS
apoyet CMD: exe.sh   8/8  15:26      _      _      1      1 3694536.0

1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended



# Read the output
---

In [33]:
output = pd.read_pickle('output/output_0.pkl')

In [34]:
output

Unnamed: 0,number,result
times2_-1.1534706811380708,-1.153471,-2.306941
times2_-1.260748318879834,-1.260748,-2.521497
times2_-2.0828844876255115,-2.082884,-4.165769
times2_1.4339443056519938,1.433944,2.867889
times2_-0.19536975431814083,-0.19537,-0.39074
times2_0.8538564724080824,0.853856,1.707713
times2_-1.2972846991297584,-1.297285,-2.594569
times2_-0.3856101256838024,-0.38561,-0.77122
times2_-0.865281013859263,-0.865281,-1.730562
times2_0.1587961641614595,0.158796,0.317592
