In [None]:
%matplotlib notebook
import os
import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.ndimage
import scipy.spatial
import sklearn.cluster
import sklearn.preprocessing

import strawb

# Load files from the ONC server
Be careful: depending on the amount of data, this can take a while!

In [None]:
# load DB
# Create the handler for the sync database using the 'Default' file name.
db = strawb.SyncDBHandler(file_name='Default')  # loads the db
# Refresh the DB and save it afterwards (save_db=True).
db.load_onc_db_update(save_db=True)  # update the DB, could take some time if it has to load info. from ONC

Print some info from the DB

In [None]:
# Column names of the DB table
print(db.dataframe.columns)

# First the available device codes, then the data product codes
# (the parts of each module that produce data)
for column_name in ('deviceCode', 'dataProductCode'):
    print(db.dataframe[column_name].unique())

# Different measurement types for PMTSPEC and LIDAR;
# works only if hdf5 attributes are imported from files on disc
# print(db.dataframe.measurement_type.unique())

### Select (a) file(s) of interest

In [None]:
# Select the camera data (dataProductCode 'MSSCD') of the PMT-spectrometer module.
mask = db.dataframe.deviceCode == 'TUMPMTSPECTROMETER001'  # that's the pmtspec module
mask &= db.dataframe.dataProductCode == 'MSSCD'  # that's the camera data of the pmtspec module

# Keep only the files whose `dateFrom` lies within +-5 hours of the biolumi event.
# timestamp = pd.Timestamp('2022-03-04T23:44:09', tz='UTC')  # gain = 30
timestamp = pd.Timestamp('2021-09-04T23:44:09', tz='UTC')  # gain = 50
# Keyword form instead of the string alias '5H': the uppercase 'H' unit is
# deprecated in recent pandas versions and emits a FutureWarning there.
time_window = pd.Timedelta(hours=5)
mask &= db.dataframe.dateFrom >= timestamp - time_window  # - 5 hours
mask &= db.dataframe.dateFrom <= timestamp + time_window  # + 5 hours

# Show the DB entries selected by the mask
db.dataframe[mask]

### Download the missing files which aren't synced so far from `db.dataframe[mask]`

In [None]:
# Download only when at least one selected file is not synced yet.
all_synced = db.dataframe.synced[mask].all()
if not all_synced:
    # output=True: print output to console, download=True: download the files,
    # save_db=True: update the DB afterwards
    db.update_db_and_load_files(db.dataframe[mask], output=True, download=True, save_db=True)

# Load the file into the Camera module

In [None]:
# select the Camera file(s) -> dataProductCode == 'MSSCD'
item = db.dataframe[mask & (db.dataframe.dataProductCode == 'MSSCD')]
if item.empty:
    # Fail with a clear message instead of an index error further down.
    raise ValueError("No 'MSSCD' file matches the current selection; adjust `mask`.")

# Close the file of a previous run of this cell, if there is one.
try:
    camera.file_handler.close()
except NameError:
    pass  # first run: no `camera` instance exists yet
except Exception as err:  # best effort: report, but don't abort the cell
    print(f'Could not close the previously opened camera file: {err!r}')

# generate a virtual hdf5 to combine the datasets if there are multiple files selected
if len(item) > 1:
    vhdf5 = strawb.VirtualHDF5('MSSCD_event_view.hdf5', item.fullPath.to_list())
    file_name = vhdf5.file_name
else:
    # Positional access: after filtering, the index label 0 is usually not
    # part of `item`, so `item.fullPath[0]` would raise a KeyError.
    file_name = item.fullPath.iloc[0]

# create an instance of the Camera
camera = strawb.Camera(file_name)

### Print some parameters

In [None]:
# Read the frame times once; they are used for both ends of the date range.
frame_times = camera.file_handler.time.asdatetime()[:]

print(f'Module: {camera.file_handler.module}')
print(f'Number of Frames: {camera.file_handler.exposure_time.shape[0]}')
print(f'Date: {np.min(frame_times)} - {np.max(frame_times)}')
print(f'Exposure Times [s]: {np.unique(camera.file_handler.exposure_time)}')

### Mask images to export (here only one)

In [None]:
# may take some time, if the time period isn't changed, index 170 is the bright event
SEARCH_BRIGHT_FRAMES = False  # True: rank the frames by charge; False: use the known index

if SEARCH_BRIGHT_FRAMES:
    # Frames above a charge threshold, combined with `camera.images.invalid_mask`.
    # Stored as `frame_mask` so that the DB selection `mask` used by the cells
    # above is not overwritten (re-running those cells would otherwise break).
    # NOTE(review): the original comment also listed "no lucifer enabled" as a
    # criterion, but no such term is part of this expression - confirm.
    frame_mask = (camera.images.integrated_minus_dark > 1e6) & camera.images.invalid_mask

    index = np.argsort(camera.images.integrated_minus_dark)  # sort by charge [min,...,max]
    index = index[frame_mask[index]]  # keep only the frames selected by frame_mask
    index = index[::-1]  # reverse the order -> [max,...,min]
else:
    index = [170]
print(index)

## Show one image here

In [None]:
# Display the first of the selected frames.
plt.figure()
rgb = camera.images.load_rgb(index=index)
first_frame = rgb[0, :, :]  # rgb[frame, row, col]
plt.imshow(first_frame / 2**16)  # divide by 2**16 to scale the values for imshow
#plt.savefig("figures/biolumi_demo.pdf", backend="pdf")
#plt.savefig("figures/biolumi_demo.png", dpi=120)

## How to access the raw data (numpy)

In [None]:
# The raw pixel values are NOT loaded into the module by default, to save RAM.
# They can be read directly from the file by index.
a = camera.file_handler.raw[[1,3]]  # direct h5py access, allows only sorted (non-duplicate) index access.
print(a.shape)

a = camera.file_handler.raw.getunsorted([1,3])  # STRAWb helper for unsorted (and duplicate) index access
print(a.shape)

raw = camera.file_handler.raw[:]  # get all images (reads every frame of the file)
# an array of images is returned by default, even if only one element is accessed
print(f'raw shape: {raw.shape}') # shape of images, n_pic x 2D shape of picture
print(f'raw picture: {raw[0].shape}') # 2D shape of picture

# NOTE(review): presumably crops every frame to the effective pixel area of the sensor - confirm
raw = camera.images.cut2effective_pixel_arr(raw)
print(f'shape reduced to effective pixel: {raw[0].shape}') # 2D shape of picture

# Now we have the raw data restricted to the valid pixel range

In [None]:
# Histogram of the raw values of a few single pixels over all frames
bins = np.linspace(0, 2**16, 1000)
plt.figure()
for step in (1, 3, 5, 7, 9):
    # the pixel moves diagonally: (1200 - 50*step, 900 - 50*step)
    row, col = 1200 - 50*step, 900 - 50*step
    plt.hist(raw[:, row, col], bins=bins, histtype='step')

plt.yscale('log')
plt.show()

# Load or generate ImageClusterDB

In [None]:
# Reuse the stored cluster DB when its default file exists, otherwise run the
# cluster search on the loaded camera file and store the result.
cluster_db_file = strawb.sync_db_handler.ImageClusterDB._default_file_name_
if os.path.exists(cluster_db_file):
    image_cluster_db = strawb.sync_db_handler.ImageClusterDB()
else:
    image_cluster_db = strawb.sync_db_handler.ImageClusterDB(load_db=False)
    image_cluster_db.dataframe = camera.find_cluster.df_all()
    image_cluster_db.save_db()

image_cluster_db.dataframe

### Get a DataFrame without label 0 and add charge, charge_log

In [None]:
# Drop the entries with label 0. The explicit copy makes `df` independent of
# the DB frame, so adding columns below does not raise SettingWithCopyWarning.
# (The former second filter on `df_2` referenced a name that is never defined
# in this notebook and failed on a fresh kernel; it is removed.)
df = image_cluster_db.dataframe[image_cluster_db.dataframe.label != 0].copy()

# Noise-subtracted charge and its natural logarithm.
df['charge'] = (df.charge_with_noise - df.noise).to_numpy()
# NOTE(review): entries with charge <= 0 yield NaN/-inf here - confirm whether
# they should be filtered before `charge_log` is used.
df['charge_log'] = np.log(df['charge'].to_numpy())

### Hist of cluster sizes

In [None]:
# Distribution of the cluster sizes on log-log axes
parameter = df.n_pixel
# integer bin edges of similar width in log space (duplicates removed)
log_edges = np.geomspace(parameter.min(), parameter.max()*1.1, 100)
bins = np.unique(log_edges.astype(int))

# # linear bins
# bins = np.arange(int(df.n_pixel.min()), int(df.n_pixel.max()*1.1), 1)

count, edges = np.histogram(parameter, bins=bins)

fig, ax = plt.subplots()
step_patch = ax.stairs(count, edges, fill=True)
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('Cluster Size [pixel]')
ax.set_ylabel('Counts')
ax.grid()
step_patch.zorder = 5  # raise the z-order of the filled histogram
fig.tight_layout()

## Show images with detected clusters larger than a threshold

In [None]:
# get pictures filtered by a parameter and a limit

# limit = 2e5
# parameter = 'charge'

limit = 20  # limit from plot
parameter = 'n_pixel'

file_times = camera.file_handler.time.asdatetime()[:]

# Look every time stamp up only once, even if several clusters share a frame.
selected_times = df.loc[df[parameter] > limit, 'time'].drop_duplicates()

indexes = []
n_missing = 0
for time_i in selected_times:
    hits = np.flatnonzero(file_times == time_i)
    if hits.size:
        indexes.append(hits[0])  # first matching frame
    else:
        # The cluster DB can come from a stored file; its times are then not
        # guaranteed to exist in the camera file loaded in this session.
        n_missing += 1
if n_missing:
    print(f'{n_missing} cluster time(s) not found in the loaded camera file - skipped.')
indexes = np.unique(indexes).astype(int)

# frame index -> all clusters detected in that frame
index_df_dict = {i: df[df.time == file_times[i]] for i in indexes}
index_df_dict.keys()

In [None]:
# (column, marker, legend label) of the reference points drawn for each cluster
point_styles = (
    ('center_of_mass', 'o', 'Center of Mass'),
    ('center_of_pix', 'x', 'Center of Pix'),
    ('box_center_x_y', '>', 'Center of Box'),
)

for i, df_i in index_df_dict.items():
    plt.figure()
    rgb = camera.images.load_rgb(index=i)
    plt.imshow(rgb[0, :, :] / 2**16)  # rgb[frame, row, col], scaled by 2**16

    # only the clusters above the limit are annotated
    big_clusters = df_i[df_i[parameter] > limit]
    for _, cluster in big_clusters.iterrows():
        plt.plot(strawb.tools.connect_polar(cluster.box_corners_y),
                 strawb.tools.connect_polar(cluster.box_corners_x),
                 color='w', alpha=.5, label='Min. Box')
        for column, marker, label in point_styles:
            # the stored coordinates are reversed before plotting
            plt.plot(*cluster[column][::-1], marker, color='w', alpha=.75, label=label)

    # one legend entry per label, even if it was drawn several times
    handles, labels = plt.gca().get_legend_handles_labels()
    unique_entries = dict(zip(labels, handles))
    plt.legend(unique_entries.values(), unique_entries.keys())

# ML filtering

In [None]:
# Set up a plotting function for fast df insights
def stair_scatter(df, color=None, size=1, alpha=.1, ax=None, columns=None, log=True, **kwargs):
    """Scatter the given columns pairwise in a lower-triangular ("stair") grid.

    Grid row ``i`` shows ``columns[i + 1]`` on the y-axis and grid column ``j``
    shows ``columns[j]`` on the x-axis; only the panels with ``j <= i`` get data.

    Parameters
    ----------
    df : DataFrame holding the data.
    color, size, alpha : passed to ``scatter`` as ``c``, ``s`` and ``alpha``.
    ax : 2D array of axes indexed as ``ax[row, col]``. If None, a new figure is
        created, labelled and (optionally) set to log scale. Existing axes are
        only drawn into, never re-formatted.
    columns : column names to plot, defaults to ``df.columns``.
    log : use logarithmic axes when the grid is created here.
    **kwargs : forwarded to ``plt.subplots`` when the grid is created here.

    Returns
    -------
    The 2D array of axes.
    """
    if columns is None:
        columns = df.columns
    y_names = columns[1:]   # one grid row per column, except the first
    x_names = columns[:-1]  # one grid column per column, except the last

    if ax is None:
        fig, ax = plt.subplots(nrows=len(y_names), ncols=len(x_names),
                               sharex='col', sharey='row',
                               squeeze=False, **kwargs)

        # x-axis formatting lives on the bottom row (x is shared per column)
        for j, x_name in enumerate(x_names):
            bottom_ax = ax[-1, j]
            if log:
                bottom_ax.set_xscale('log')
            bottom_ax.set_xlabel(x_name.replace('_', ' '), rotation=0)

        # y-axis formatting lives on the first column (y is shared per row);
        # panels above the diagonal are switched off
        for i, y_name in enumerate(y_names):
            left_ax = ax[i, 0]
            left_ax.set_ylabel(y_name.replace('_', ' '), rotation=90)
            if log:
                left_ax.set_yscale('log')
            for j in range(len(x_names)):
                if j > i:
                    ax[i, j].axis('off')
                else:
                    ax[i, j].grid()

    # fill the panels on and below the diagonal
    for i, y_name in enumerate(y_names):
        for j, x_name in enumerate(x_names[:i + 1]):
            ax[i, j].scatter(df[x_name], df[y_name], s=size, c=color, alpha=alpha)

    return ax

In [None]:
import sklearn.cluster

# define a container for a ML-Model
class ClusterModel:
    """Container that fits a clustering model and ranks the clusters by size.

    The selected columns are standardised, the model is fitted on a random
    sub-sample of the rows and afterwards used to label all rows.

    Attributes
    ----------
    name : name used in plot legends.
    model : the fitted model.
    labels : label of every row as returned by ``model.predict``.
    clusters : the distinct labels, sorted by descending count.
    clusters_counts : number of rows per entry of ``clusters``.
    labels_cs : like ``labels`` but renumbered by rank (0 is the largest cluster).
    df : the DataFrame handed in.
    """

    def __init__(self, model, df, columns, n=1000, name='', *kwargs, random_state=None):
        """Fit ``model`` on ``n`` randomly drawn rows of ``df[columns]``.

        Parameters
        ----------
        model : object providing ``fit(X)`` and ``predict(X)``.
        df : DataFrame with the data.
        columns : columns of ``df`` used as features.
        n : number of rows drawn for the fit. They are drawn independently,
            so a row can be picked more than once.
        name : name used in plot legends.
        *kwargs : additional positional arguments; accepted but not used.
        random_state : keyword-only seed for the row sampling. None (default)
            draws from NumPy's global random state, as before.

        Raises
        ------
        ValueError : if ``df`` has no rows.
        """
        if len(df) == 0:
            raise ValueError('`df` has no rows to fit the model on.')

        self.name = name
        # define the model
        self.model = model

        # fit the model on a random sub-sample of the standardised features
        X = sklearn.preprocessing.StandardScaler().fit_transform(df[columns].to_numpy())
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        random_int = rng.randint(0, len(X), n)  # draw n row indexes of X
        self.model.fit(X[random_int])

        # label all rows
        self.labels = self.model.predict(X)

        # sort the labels by count, largest cluster first
        clusters, clusters_counts = np.unique(self.labels, return_counts=True)
        order = np.argsort(-clusters_counts)
        self.clusters = clusters[order]
        self.clusters_counts = clusters_counts[order]

        # rank of the cluster of every row; -1 would mark a row without cluster
        self.labels_cs = np.zeros_like(self.labels, dtype=int) - 1
        for i, c_i in enumerate(self.clusters):
            self.labels_cs[self.labels == c_i] = i

        self.df = df

    def plot_level_hist(self, ax=None, norm_x=True, norm_y=True):
        """Plot the cluster counts against the cluster rank.

        Parameters
        ----------
        ax : axes to draw into. If None, a new figure with a logarithmic
            y-axis and axis labels is created.
        norm_x : spread the ranks evenly over [0, 1] instead of 0, 1, 2, ...
        norm_y : divide the counts by the largest count.

        Returns
        -------
        The axes that were drawn into.
        """
        if ax is None:
            plt.figure()
            ax = plt.gca()
            ax.set_yscale('log')
            ax.set_xlabel('Classification Typ')
            ax.set_ylabel('Count')

        n_clusters = len(self.clusters_counts)
        if norm_x:
            class_typ = np.linspace(0, 1, n_clusters)
        else:
            # one x value per cluster; an additional point would not match
            # the number of counts and make `ax.plot` fail
            class_typ = np.arange(n_clusters)

        if norm_y:
            norm_y_s = self.clusters_counts.max()
        else:
            norm_y_s = 1

        ax.plot(class_typ, self.clusters_counts/norm_y_s, label=f'{self.name.replace("_","-")} {len(self.clusters)}')
        return ax

In [None]:
# Train model
columns = ['n_pixel', 'charge']
columns_log = ['n_pixel', 'charge_log']

# Fixed seed so that "Restart & Run All" reproduces the same clusters:
# ClusterModel draws its training rows from NumPy's global random state and
# KMeans initialises its centroids randomly.
SEED = 42
np.random.seed(SEED)

n = 20000
n_clusters = 4
k_means_4 = ClusterModel(sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=SEED),
                         df, columns, name='KMeans_4', n=n)
# k_means_4_log = ClusterModel(sklearn.cluster.KMeans(n_clusters=n_clusters), df, columns_log, name='KMeans_4_log', n=n)

n_clusters = 2
k_means_2 = ClusterModel(sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=SEED),
                         df, columns, name='KMeans_2', n=n)
# k_means_2_log = ClusterModel(sklearn.cluster.KMeans(n_clusters=n_clusters), df, columns_log, name='KMeans_2_log', n=n)

# Alternative algorithms, kept for reference:
# n = 2000
# aff_pro = ClusterModel(sklearn.cluster.AffinityPropagation(damping=.7), df, columns, name='AffinityPropagation')
# aff_pro_log = ClusterModel(sklearn.cluster.AffinityPropagation(damping=.7), df, columns_log, name='AffinityPropagation_log')

# n = 20000
# mean_shift = ClusterModel(sklearn.cluster.MeanShift(n_jobs=-1), df, columns, name='MeanShift')
# mean_shift_log = ClusterModel(sklearn.cluster.MeanShift(n_jobs=-1), df, columns_log, name='MeanShift_log')



In [None]:
# Models to compare (the commented entries belong to the optional models)
algorithms = [
    k_means_2,  # k_means_2_log,
    k_means_4,  # k_means_4_log,
    # aff_pro, aff_pro_log,
    # mean_shift, mean_shift_log
]

# The first call creates the axes, all following calls draw into the same axes.
ax = None
for alg_i in algorithms:
    ax = alg_i.plot_level_hist(ax)

plt.legend()
plt.grid()
plt.tight_layout()

In [None]:
columns = [
    # 'plateau_sizes', 'left_thresholds', 'right_thresholds'
    'n_pixel',
    'charge',  # 'charge_with_noise', 'noise',
    'mean_absolute_deviation_per_pixel',
    'mean_deviation_per_pixel_in_sigma',
]

alg_i = k_means_4
cluster_rank = alg_i.labels_cs
# colour and marker size both follow the cluster rank
stair_scatter(alg_i.df,
              color=cluster_rank,
              size=1 + 10*cluster_rank,
              columns=columns,
              alpha=1,
              figsize=(9, 9))

plt.tight_layout()

In [None]:
alg_i = k_means_4
# Count the rows per cluster label
labels, counts = np.unique(alg_i.labels_cs, return_counts=True)

# Order the labels by ascending count and take the one in the middle as threshold
labels_by_count = labels[np.argsort(counts)]
threshold_label = labels_by_count[len(labels) // 2]

# Show the rows whose label lies above the threshold
df[alg_i.labels_cs > threshold_label]