In [None]:
# for developers: load IPython's autoreload extension and switch it to mode 2,
# so that edited modules are re-imported automatically before code is executed
%load_ext autoreload
%autoreload 2

In [None]:
# import hdf5plugin # required to access LZ4-encoded HDF5 data sets, if not on your global path
import matplotlib.pyplot as plt
from diffractem import version, proc2d, pre_proc_opts, io, tools
from diffractem.dataset import Dataset
from tifffile import imread
import numpy as np
from dask.distributed import Client, LocalCluster, TimeoutError
import os
import pandas as pd

%matplotlib widget

opts = pre_proc_opts.PreProcOpts('preproc.yaml')
# opts.im_exc = 'indexamajig'
cfver = !{opts.im_exc} -v
print(f'Running on diffractem:', version())
print(f'Running on', cfver[0])
print(f'Current path is:', os.getcwd())

pxmask=imread(opts.pxmask)
reference=imread(opts.reference)

# Processing of dose-fractionated data sets
...which is ideally done after running the full workflow once, including indexing, or even merging.
Essentially, it starts by creating processed data files, which works very similarly to `preprocessing.ipynb`.
In this script, we will prepare a dataset with aggregated frames 0+1+2, as well as files containing all frames separately, or a cumulative sum of them.

Then, `.stream` files with integrated intensities are derived from the already known indexing solutions -- similarly to the re-integration as explained in `indexing.ipynb`.

After running this notebook, you will have new stream files - one with a different aggregation, one with all single shots, and one with all different aggregations (cumulated single shots).

In [None]:
cluster_port = 8786
scheduler_address = f'127.0.0.1:{cluster_port}'

try:
    # first try to attach to a scheduler that is already listening on the port
    client = Client(address=scheduler_address, timeout='2s')
    print('Running cluster scheduler found and connected.')
    # switch the workers of the existing cluster to the notebook's directory
    client.run(os.chdir, os.getcwd())
except (OSError, TimeoutError):
    # nothing answered (or the connection failed): start a local cluster
    # on the same address and connect to it
    print('Seems no cluster scheduler is running. Starting one.')
    cluster = LocalCluster(
        host=scheduler_address,
        n_workers=20,
        threads_per_worker=2,
        local_directory='/scratch/distributed',
    )
    client = Client(address=scheduler_address)

client

## Load the raw data set
We start, just as in `preprocessing.ipynb`, by loading the raw data set.

In [None]:
# refresh the parameters from the .yaml file
opts.load()

# expand the file pattern for the raw .nxs files and open them as one data set
raw_files = io.expand_files('raw_data/*.nxs', validate=True)
print(f'Found {len(raw_files)} raw files. Have fun pre-processing!')
ds = Dataset.from_files(raw_files, chunking=50)

# merge the shutter_time metadata field into the data set; the selection
# queries further down filter on it
ds.merge_meta('/%/instrument/detector/collection/shutter_time')
ds

## Broadcasting to another aggregation

We now make an aggregation where we include the first frame (`frame==0`), which we omitted from the original aggregation we used for indexing etc., and only include the first three frames.

### Preparing data files
...works exactly as for the original aggregation, just that instead of `compute_pattern_info`, we use `merge_pattern_info`, in order to get the pattern information (peaks, center,...) from the `image_info.h5`.

From there, it works exactly the same again: do your hit correction, compute the final image using `proc2d.correct_image`, check the outcome using `view`, and compute and save it.
After indexing, you can use the newly made files for integration just as well - see `indexing.ipynb`.

In [None]:
# now, e.g. make another aggregation: sum up frames 0 to 2 (shutter_time 2),
# grouped by sample / region / run / crystal_id
ds_0to2 = ds.aggregate(
    query='frame >= 0 and frame <= 2 and shutter_time == 2',
    by=['sample', 'region', 'run', 'crystal_id'],
    how='sum',
    new_folder='proc_data',
    file_suffix='_0to2.h5',
)

# take the pattern information from image_info.h5 instead of recomputing it
ds_0to2.merge_pattern_info('image_info.h5')

# keep only the shots with more peaks than the configured minimum
hit_query = f'num_peaks > {opts.min_peaks}'
ds_0to2_hit = ds_0to2.get_selection(hit_query, file_suffix='_hit.h5')

In [None]:
# ...and correct the images
opts.load()
ds_compute = ds_0to2_hit

# keep in mind that this is a lazy computation, so nothing is actually done yet
img_final = proc2d.correct_image(
    ds_compute.raw_counts,
    opts,
    ds_compute.shots.lor_x.values,
    ds_compute.shots.lor_y.values,
    ds_compute.peak_data,
)

# register the corrected images as the 'corrected' stack and make it the
# diffraction stack of the data set, then inspect the result
ds_compute.add_stack('corrected', img_final, overwrite=True, set_diff_stack=True)
ds_compute.view()

Now, `ds_0to2_hit` has everything to be written to disk!

In [None]:
# run the computation and write the files (without the raw_counts stack);
# the list file hits_0to2.lst is passed along for the new data set
ds_0to2_hit.compute_and_save(
    diff_stack_label='corrected',
    list_file='hits_0to2.lst',
    exclude_stacks='raw_counts',
    client=client,
    overwrite=True,
)

### Running the integration
...is virtually identical to the (Re-)Integration step in `indexing.ipynb`.
Just make a `.sol` file for your new data set and fire up `indexamajig --indexing=file`.

In [None]:
dsname = 'hits_0to2'
# if restarting from here, re-open the data set from its list file first:
# ds_0to2_hit = Dataset.from_files(dsname + '.lst', open_stacks=False)

# derive the .sol file for this data set from the solutions in master.stream
ds_0to2_hit.get_indexing_solution('master.stream', sol_file=f'{dsname}.sol')

In [None]:
# Build the indexamajig command line that integrates the data set `dsname`
# using the indexing solutions from its .sol file.

# The stream file is written into ./streams, so make sure the folder exists
# (replaces a manual `%mkdir streams`).
os.makedirs('streams', exist_ok=True)

# per-shot fields that are handed to indexamajig as copy_fields
copy_fields = [f'/%/shots/{cf}' for cf in
               ['sample', 'region', 'crystal_id', 'run',
                'adf1', 'adf2', 'lor_hwhm', 'center_x', 'center_y']]

opts.load()
# Use the executable configured in the options -- the same one whose version is
# reported in the first cell -- instead of a machine-specific absolute path.
# NOTE(review): opts.im_exc presumably has to point to a CrystFEL build that
# supports indexing from file -- confirm for your installation.
cfcall = tools.call_indexamajig(f'{dsname}.lst', 'refined.geom',
                                output=f'streams/{dsname}.stream',
                                cell='refined.cell',
                                im_params=opts.integration_params,
                                procs=40, exc=opts.im_exc,
                                fromfile_input_file=f'{dsname}.sol',
                                copy_fields=copy_fields)

# the command is only printed here; run it yourself
print('--- RUN THIS ---------------')
print(cfcall)

## Broadcasting to single shots

(Advanced)

Now, we want to make a corrected and annotated (i.e., including peaks and centers) version of the raw data, i.e., single movie frames, for example to study radiation damage or be flexible during merging.
This is done essentially exactly the same as if you were just using a different aggregation (see above), just that instead of `Dataset.aggregate` you just use `Dataset.get_selection` to restrict the range of included frames.

In [None]:
# now, do exactly the same thing as above, but on single-shot data
unchunk = False  # IMPORTANT: set to True to look at the set with .view(), otherwise set to False

# no aggregation this time: just restrict the raw data to frames 0..9
# taken with shutter_time 2
ds_sgl = ds.get_selection(
    'frame >= 0 and frame < 10 and shutter_time==2',
    file_suffix='_allframe.h5',
    new_folder='proc_data',
)

# pattern information comes from image_info.h5; afterwards keep only the
# shots with more peaks than the configured minimum
ds_sgl.merge_pattern_info('image_info.h5')
ds_sgl = ds_sgl.get_selection(f'num_peaks > {opts.min_peaks}', file_suffix='_hit.h5')

if unchunk:
    ds_sgl.rechunk_stacks(1)

opts.load()
ds_compute = ds_sgl

# keep in mind that this is a lazy computation, so nothing is actually done yet
img_final = proc2d.correct_image(
    ds_compute.raw_counts,
    opts,
    ds_compute.shots.lor_x.values,
    ds_compute.shots.lor_y.values,
    ds_compute.peak_data,
)

ds_compute.add_stack('corrected', img_final, overwrite=True, set_diff_stack=True)

In [None]:
# ...and run the computation; the raw_counts stack is excluded from the output
dsname = 'hits-allframe'
ds_compute.compute_and_save(
    diff_stack_label='corrected',
    list_file=f'{dsname}.lst',
    exclude_stacks='raw_counts',
    client=client,
    overwrite=True,
)

In [None]:
# Finally: integrate Bragg spot intensities, grabbing the indexing
# solutions from master.stream
# IMPORTANT - 'frame' now has to be in the copy_fields, which allows to
# later determine which dose fractionation frame a stream chunk belongs to.
dsname = 'hits-allframe'
ds_sgl.get_indexing_solution('master.stream', sol_file=dsname + '.sol')

# The stream file is written into ./streams, so make sure the folder exists.
os.makedirs('streams', exist_ok=True)

# IMPORTANT: NOW 'frame' HAS TO BE IN!
copy_fields = [f'/%/shots/{cf}' for cf in
               ['frame', 'sample', 'region', 'crystal_id', 'run',
                'adf1', 'adf2', 'lor_hwhm', 'center_x', 'center_y']]

opts.load()
# Use the executable configured in the options -- the same one whose version is
# reported in the first cell -- instead of a machine-specific absolute path.
# NOTE(review): opts.im_exc presumably has to point to a CrystFEL build that
# supports indexing from file -- confirm for your installation.
cfcall = tools.call_indexamajig(f'{dsname}.lst', 'refined.geom',
                                output=f'streams/{dsname}.stream',
                                cell='refined.cell',
                                im_params=opts.integration_params,
                                procs=40, exc=opts.im_exc,
                                fromfile_input_file=f'{dsname}.sol',
                                copy_fields=copy_fields)

# the command is only printed here; run it yourself
print('--- RUN THIS ---------------')
print(cfcall)

## Make cumulative-sum files
(Advanced)

Finally, you can also create a set of files, which instead of single frames, have their cumulative sums, which means that you can pick in hindsight which ones you want to use for the later steps. 
Some might prefer a workflow where you just make files for different aggregations (as above) that you think make sense.

Anyway - for this case, the function `transform_stack_groups` does exactly what you want: a cumulative sum over each group in your stack matching one unique crystal. The rest is as usual.

In [None]:
# only needed when restarting the notebook from this point:
# re-open the single-shot data set from its list file
ds_sgl = Dataset.from_files(
    'hits-allframe.lst',
    chunking=20,
)

In [None]:
# copy the data set (the query 'True' is passed as the selection, with a new
# file suffix) and apply the transform function, which defaults to cumulation,
# to the 'corrected' stack
ds_cum_0 = ds_sgl.get_selection('True', file_suffix='_cum_from_0.h5')
ds_cum_0.transform_stack_groups(stacks='corrected')

In [None]:
# run the computation. Depending on your computer and data set size, have a coffee or go to bed now.
dsname = 'hits_cum-0'
ds_cum_0.compute_and_save(
    diff_stack_label='corrected',
    list_file=f'{dsname}.lst',
    exclude_stacks='raw_counts',
    client=client,
    overwrite=True,
)

In [None]:
# Finally: integrate Bragg spot intensities, grabbing the indexing
# solutions from master.stream
# IMPORTANT - 'frame' now has to be in the copy_fields, which allows to
# later determine which dose fractionation frame a stream chunk belongs to.

dsname = 'hits_cum-0'
ds_cum_0.get_indexing_solution('master.stream', sol_file=dsname + '.sol')

# The stream file is written into ./streams, so make sure the folder exists.
os.makedirs('streams', exist_ok=True)

# IMPORTANT: NOW 'frame' HAS TO BE IN!
copy_fields = [f'/%/shots/{cf}' for cf in
               ['frame', 'sample', 'region', 'crystal_id', 'run',
                'adf1', 'adf2', 'lor_hwhm', 'center_x', 'center_y']]

opts.load()
# Use the executable configured in the options -- the same one whose version is
# reported in the first cell -- instead of a machine-specific absolute path.
# NOTE(review): opts.im_exc presumably has to point to a CrystFEL build that
# supports indexing from file -- confirm for your installation.
cfcall = tools.call_indexamajig(f'{dsname}.lst', 'refined.geom',
                                output=f'streams/{dsname}.stream',
                                cell='refined.cell',
                                im_params=opts.integration_params,
                                procs=40, exc=opts.im_exc,
                                fromfile_input_file=f'{dsname}.sol',
                                copy_fields=copy_fields)

# the command is only printed here; run it yourself
print('--- RUN THIS ---------------')
print(cfcall)