In [1]:
%matplotlib inline

Sparse Sampling
==============

**Suhas Somnath**

4/27/2018

Introduction
-------------

Example scientific problem
---------------------------
For this example, we will be working with a Band Excitation Piezoresponse Force Microscopy (BE-PFM) imaging dataset
acquired from advanced atomic force microscopes. In this dataset, a spectrum was collected for each position in a
two-dimensional grid of spatial locations. Thus, this is a three-dimensional dataset that has been flattened to a
two-dimensional matrix in accordance with the pycroscopy data format.

In [1]:
# Ensure python 3 compatibility:
from __future__ import division, print_function, absolute_import, unicode_literals

# The package for accessing files in directories, etc.:
import os

# Warning package in case something goes wrong
from warnings import warn

# Package for downloading online files:
try:
    # This package is not part of anaconda and may need to be installed.
    import wget
except ImportError:
    warn('wget not found.  Will install with pip.')
    import pip
    pip.main(['install', 'wget'])
    import wget

# The mathematical computation package:
import numpy as np

# The package used for creating and manipulating HDF5 files:
import h5py

# Packages for plotting:
import matplotlib.pyplot as plt

# Parallel computation library:
try:
    import joblib
except ImportError:
    warn('joblib not found.  Will install with pip.')
    import pip
    pip.main(['install', 'joblib'])
    import joblib

# Timing
import time

# A handy python utility that allows us to preconfigure parts of a function
from functools import partial

# Function for reading the number of CPU cores on this computer:
from multiprocessing import cpu_count

# Finally import pycroscopy for certain scientific analysis:
if True:
    import sys
    sys.path.append(os.path.split(os.path.abspath('.'))[0])
    import pycroscopy as px
else:
    try:
        import pycroscopy as px
    except ImportError:
        warn('pycroscopy not found.  Will install with pip.')
        import pip
        pip.main(['install', 'pycroscopy'])
        import pycroscopy as px

  from ._conv import register_converters as _register_converters
  warn('You are using the unity_dev branch, which is aimed at a 1.0 release for pycroscopy. '


Load the dataset
--------------------
In order to demonstrate sparse sampling, we will be using a real experimental dataset that is available on the pycroscopy GitHub project. First, let's download this file from GitHub:

In [3]:
# Fetch the raw data file from GitHub, discarding any stale local copy first:
url = 'https://raw.githubusercontent.com/pycroscopy/pycroscopy/master/data/BELine_0004.h5'
h5_path = 'temp.h5'
if os.path.exists(h5_path):
    os.remove(h5_path)
# The return value is assigned to a throwaway name so that it is not echoed as cell output
_ = wget.download(url, out=h5_path)

Now, let's open this HDF5 file and see its contents:

In [4]:
# Open the existing file in read/write mode ('r+'): later cells add a new group and datasets to it
h5_file = h5py.File(h5_path, 'r+')
# Show the hierarchy of groups and datasets inside the file
px.hdf_utils.print_tree(h5_file)

/
├ Measurement_000
  ---------------
  ├ Channel_000
    -----------
    ├ Bin_FFT
    ├ Bin_Frequencies
    ├ Bin_Indices
    ├ Bin_Step
    ├ Bin_Wfm_Type
    ├ Excitation_Waveform
    ├ Noise_Floor
    ├ Position_Indices
    ├ Position_Values
    ├ Raw_Data
    ├ Spatially_Averaged_Plot_Group_000
      ---------------------------------
      ├ Bin_Frequencies
      ├ Mean_Spectrogram
      ├ Spectroscopic_Parameter
      ├ Step_Averaged_Response
    ├ Spectroscopic_Indices
    ├ Spectroscopic_Values
    ├ UDVS
    ├ UDVS_Indices


The focus of this example is not on the data storage or arrangement but rather on demonstrating sparse sampling, so let's dive straight into the main dataset from which the spectra will be sampled:

In [31]:
# Handle to the measurement group that contains the raw data and the datasets that accompany it
h5_meas_grp = h5_file['Measurement_000']

# Wrap the raw data in a PycroDataset and record what quantity it holds and in which units
h5_main = px.PycroDataset(h5_meas_grp['Channel_000/Raw_Data'])
px.hdf_utils.write_simple_attrs(
    h5_main,
    {'quantity': 'Cantilever Vertical Deflection', 'units': 'V'},
)

print('\nThe main dataset:\n------------------------------------')
print(h5_main)

# Sizes of the two position dimensions of the main dataset
num_rows, num_cols = h5_main.pos_dim_sizes


The main dataset:
------------------------------------
<HDF5 dataset "Raw_Data": shape (16384, 119), type "<c8">
located at: 
/Measurement_000/Channel_000/Raw_Data 
Data contains: 
Cantilever Vertical Deflection (V) 
Data dimensions and original shape: 
Position Dimensions: 
X - size: 128 
Y - size: 128 
Spectroscopic Dimensions: 
Frequency - size: 119


In [None]:
import random

In [7]:
# Fraction of all positions (rows of the main dataset) to keep in the sparse subset
fract = 0.25
# Seed the generator so that the same positions are drawn on every run of this cell,
# which keeps the rest of the notebook reproducible after a kernel restart
random.seed(0)
# Draw the row numbers without replacement, so no position is picked twice
chosen_pos = random.sample(range(h5_main.shape[0]), int(fract * h5_main.shape[0]))

In [46]:
# Sanity check: True when no position was selected more than once
len(set(chosen_pos)) == len(chosen_pos)

True

In [37]:
# Two identical columns that simply count 0, 1, 2, ... - one row per chosen position
indices = np.column_stack((np.arange(len(chosen_pos)),) * 2)
indices.shape

(4096, 2)

In [20]:
# Read all position values into memory, then keep only the rows of the chosen positions
values = h5_main.h5_pos_vals[()][chosen_pos, :]
values.shape

(4096, 2)

In [21]:
# Read the entire main dataset into memory, then keep only the spectra at the chosen positions
data = h5_main[()][chosen_pos, :]

In [22]:
# Create a group to hold the sparse-sampled results of the main dataset.
# In the tree printed later in this notebook it appears as "Raw_Data-Sparse_Sampling_000".
h5_sparse_group = px.hdf_utils.create_results_group(h5_main, 'Sparse_Sampling')

In [23]:
# Store the indices built above as the "Position_Indices" dataset of the sparse group
h5_pos_inds_sparse = h5_sparse_group.create_dataset(
    'Position_Indices',
    dtype=px.write_utils.INDICES_DTYPE,
    data=indices,
)
# Carry the attributes of the original position indices dataset over to the new one
_ = px.hdf_utils.copy_attributes(h5_main.h5_pos_inds, h5_pos_inds_sparse)

In [27]:
# Store the position values of the chosen positions as the "Position_Values" dataset of the sparse group
h5_pos_vals_sparse = h5_sparse_group.create_dataset(
    'Position_Values',
    dtype=px.write_utils.VALUES_DTYPE,
    data=values,
)
# Carry the attributes of the original position values dataset over to the new one
_ = px.hdf_utils.copy_attributes(h5_main.h5_pos_vals, h5_pos_vals_sparse)

In [33]:
# Reuse the quantity and units that were recorded on the original main dataset
main_quantity = px.hdf_utils.get_attr(h5_main, 'quantity')
main_units = px.hdf_utils.get_attr(h5_main, 'units')

# Write the sampled spectra as a new main dataset. It is linked to the sparse position
# datasets created above and to the spectroscopic datasets of the original main dataset.
# NOTE(review): the two positional None arguments are presumably the position and
# spectroscopic dimension descriptors - confirm against write_main_dataset's signature.
h5_sparse_main = px.hdf_utils.write_main_dataset(
    h5_sparse_group, data, 'Sparse_Data',
    main_quantity, main_units,
    None, None,
    h5_pos_inds=h5_pos_inds_sparse,
    h5_pos_vals=h5_pos_vals_sparse,
    h5_spec_inds=h5_main.h5_spec_inds,
    h5_spec_vals=h5_main.h5_spec_vals,
)

print(h5_sparse_main)

<HDF5 dataset "Sparse_Data": shape (4096, 119), type "<c8">
located at: 
/Measurement_000/Channel_000/Raw_Data-Sparse_Sampling_000/Sparse_Data 
Data contains: 
Cantilever Vertical Deflection (V) 
Data dimensions and original shape: 
Position Dimensions: 
X - size: 4096 
Y - size: 4096 
Spectroscopic Dimensions: 
Frequency - size: 119


In [34]:
# Print the contents of the measurement group again to confirm that the new
# sparse-sampling group and its datasets were added
px.hdf_utils.print_tree(h5_meas_grp)

/Measurement_000
├ Channel_000
  -----------
  ├ Bin_FFT
  ├ Bin_Frequencies
  ├ Bin_Indices
  ├ Bin_Step
  ├ Bin_Wfm_Type
  ├ Excitation_Waveform
  ├ Noise_Floor
  ├ Position_Indices
  ├ Position_Values
  ├ Raw_Data
  ├ Raw_Data-Sparse_Sampling_000
    ----------------------------
    ├ Position_Indices
    ├ Position_Values
    ├ Sparse_Data
  ├ Spatially_Averaged_Plot_Group_000
    ---------------------------------
    ├ Bin_Frequencies
    ├ Mean_Spectrogram
    ├ Spectroscopic_Parameter
    ├ Step_Averaged_Response
  ├ Spectroscopic_Indices
  ├ Spectroscopic_Values
  ├ UDVS
  ├ UDVS_Indices


In [17]:
# Clean up: close the HDF5 file first, then delete the temporary copy that was downloaded
h5_file.close()
os.remove(h5_path)