In [None]:
%matplotlib notebook

import tqdm
import tqdm.notebook
import h5py

import os
import shutil
import multiprocessing
import sys

from contextlib import redirect_stdout
import io

import dateutil
import datetime
import time

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import plotly.express as px

import numpy as np
import random
import strawb

import pandas

import logging
import gc

import threading
import glob

# Define path to store results

In [None]:
# Directory for the outputs of this notebook: the log file and the result files are written here.
path = os.path.join(strawb.Config.proc_data_dir, 'image_cluster_search')
if not os.path.exists(path):
    print(f'Create path: {path}')
    # exist_ok=True: no error if the directory appears between the check above and this call
    os.makedirs(path, exist_ok=True)

# Logger set-up

In [None]:
# Fields of every log record; they are joined with ';' below to form the format string.
formatter_list = ['%(asctime)s',
                  '%(levelname)s',  # Text logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL').
                  '%(processName)s',  # Process name (if available).
                  '%(threadName)s',  # Thread name (if available).
                  # '%(thread)d',  # Thread ID (if available).
                  # '%(name)s',  # Name of the logger (logging channel).
                  '%(funcName)s',  # Name of function containing the logging call.
                  # '%(lineno)d',  # Source line number where the logging call was issued (if available).
                  '%(message)s'  # The logged message, computed as msg % args.
                  ]

log_level = logging.DEBUG

# Route messages issued via the `warnings` module into logging and configure the root logger.
logging.captureWarnings(True)
logger = logging.getLogger()
logger.setLevel(log_level)

# File handler which appends all records to one log file (plain FileHandler, i.e. no rotation).
file_name = os.path.join(path, 'scan_images.log')

# Re-running this cell must not stack another handler on the root logger, otherwise every
# record is written to the file multiple times. Drop handlers of earlier runs for this file.
for old_handler in list(logger.handlers):
    if (isinstance(old_handler, logging.FileHandler)
            and old_handler.baseFilename == os.path.abspath(file_name)):
        logger.removeHandler(old_handler)
        old_handler.close()

fh = logging.FileHandler(file_name)
fh.setLevel(log_level)

formatter = logging.Formatter(";".join(formatter_list))
fh.setFormatter(formatter)
logger.addHandler(fh)

# Test: write one record per level to check that the handler works
for i in ['debug', 'info', 'warning', 'critical']:
    getattr(logger, i)(f'Test level: {i}')

# Load synced files DB

In [None]:
# load DB
# NOTE(review): file_name='Default' presumably selects the default DB file of the package - confirm
db = strawb.SyncDBHandler(file_name='Default')  # loads the db
# save_db=True is passed so the updated DB is presumably written back to disk - TODO confirm
db.load_onc_db_update(save_db=True)  # update the DB, could take some time if it has to load info. from ONC

# Define functions to scan images

In [None]:
def find_cluster(dataframe, min_n_images=5, mean_std_start=None, mean_std_stop=None, *args, **kwargs):
    """
    Run the cluster search of the camera module on the files listed in `dataframe`.

    PARAMETER
    ---------
    dataframe: pandas.DataFrame
        File selection which is handed to `strawb.virtual_hdf5.HDF5TempFile`.
    min_n_images: int, optional
        Minimum number of images required to run the search. With fewer images, a single
        placeholder row with `label=-1` is returned instead of a search result.
    mean_std_start, mean_std_stop: optional
        Values stored in the result columns of the same name. If None, the time of the
        first (start) or the last (stop) image is stored.
    *args, **kwargs:
        Passed on to `find_cluster.df_all(...)` of the camera module.

    RETURN
    ------
    df: pandas.DataFrame
        The search result (or the placeholder row) with the additional columns `mean_std_n`
        (number of images), `mean_std_start` and `mean_std_stop`.
    """
    logger.debug(f"Find_cluster files: {len(dataframe)}")
    temp_file = strawb.virtual_hdf5.HDF5TempFile(dataframe, module=strawb.Camera)

    try:
        n_images = len(temp_file.module.file_handler.raw)

        if min_n_images <= n_images:
            # NOTE(review): the bare print() looks like the usual workaround to get tqdm output
            # displayed from a worker process in a notebook - confirm before removing.
            print()
            df = temp_file.module.find_cluster.df_all(*args, **kwargs)
        else:
            df = pandas.DataFrame({'label': [-1]})

        # add some parameters to track things
        df['mean_std_n'] = n_images

        if mean_std_start is None:
            df['mean_std_start'] = temp_file.module.file_handler.time.asdatetime()[0]
        else:
            df['mean_std_start'] = mean_std_start

        if mean_std_stop is None:
            df['mean_std_stop'] = temp_file.module.file_handler.time.asdatetime()[-1]
        else:
            df['mean_std_stop'] = mean_std_stop
    finally:
        # Drop the reference to the temp file also when the search fails; otherwise the frame
        # kept alive by the traceback would hold on to it. Then collect what became unreachable.
        del temp_file
        gc.collect()

    return df


def wrap_iter_find_cluster(t_i, dt, dataframe, min_size_cluster=7, *args, **kwargs):
    """
    Run `find_cluster` on the files of `dataframe` selected for the interval [t_i, t_i + dt).

    PARAMETER
    ---------
    t_i: pandas.Timestamp
        Start of the interval.
    dt: pandas.offsets or pandas.Timedelta
        Length of the interval. The interval end is `t_i + dt` minus one microsecond.
    dataframe: pandas.DataFrame
        Files to select from; the columns `dateFrom`, `dateTo` and `deviceCode` are used.
    min_size_cluster: int, optional
        Passed on to `find_cluster`.
    *args:
        Accepted for compatibility but not used.
    **kwargs:
        Passed on to `find_cluster`. `tqdm_kwargs` defaults to `{'desc': str(t_i)}` if not given;
        `min_size_cluster`, `mean_std_start` and `mean_std_stop` are always set by this function.

    RETURN
    ------
    df: pandas.DataFrame
        Result of `find_cluster`, or a single row with `mean_std_n=0` if no file is selected.
    """
    t_i_end = t_i + dt - pandas.Timedelta('00:00:00.000001')
    mask_time = strawb.tools.pd_timestamp_mask_between(
        dataframe.dateFrom,
        dataframe.dateTo,
        t_i,
        t_i_end,
    )
    # select once and reuse for logging, the empty check and the search
    dataframe_i = dataframe[mask_time]

    # an empty dataframe has no first row; don't let the log line fail on it
    dev_code = dataframe.deviceCode.iloc[0] if len(dataframe) > 0 else None
    logger.info(f"Iter find cluster: {t_i}; {dev_code}; files: {len(dataframe_i)}")

    if len(dataframe_i) == 0:
        return pandas.DataFrame(
            {'mean_std_n': [0],
             'mean_std_start': [t_i],
             'mean_std_stop': [t_i_end]})

    # forward extra keyword arguments instead of dropping them silently
    find_kwargs = dict(kwargs)
    find_kwargs.setdefault('tqdm_kwargs', {'desc': str(t_i)})
    find_kwargs.update(min_size_cluster=min_size_cluster,
                       mean_std_start=t_i,
                       mean_std_stop=t_i_end)

    return find_cluster(dataframe_i, **find_kwargs)


class ClusterWrapper:
    """
    Run the cluster search for one device and one time interval and store the result in one
    DB file. The multiprocessing iterator is kept as attribute so the progress can be monitored.
    """

    def __init__(self, ):
        # strawb.MProcessIterator of the run; stays None until `wrap_intervals` starts the search
        self.mpi = None
        # full path of the result file; set by `wrap_intervals`
        self.file_name = None

    def wrap_intervals(self, t_start, dt, dataframe, dt_iter=pandas.offsets.Hour(8), *args, **kwargs):
        """
        Split [t_start, t_start + dt) into steps of `dt_iter`, run `wrap_iter_find_cluster` for
        each step in parallel and save the combined result. Nothing is done if the result file
        exists already. The result file is located in the module level `path`.

        PARAMETER
        ---------
        t_start: pandas.Timestamp
            start time of the iteration
        dt: pandas.offsets
            define the interval covered by the file
        dataframe: pandas.DataFrame
            files to scan; the first entry of `deviceCode` is used for the file name
        dt_iter: pandas.offsets, optional
            define the interval of iterations, Candidates for the interval -> pandas.date_range(..., freq=)
        *args, **kwargs:
            accepted but not used
        """
        # define the interval per file
        # Candidates for the interval -> pandas.date_range(..., freq=)
        t_end = t_start + dt

        # normalize=True -> Normalize start/end dates to midnight before generating date range.
        # --> cut the entries which are not before t_end: [dr<t_end]
        dr = pandas.date_range(start=t_start,
                               end=t_end,
                               freq=dt_iter,
                               normalize=True
                               )
        dr = dr[dr < t_end]
        logger.info(f'dr: {dr.min()} - {dr.max() - pandas.Timedelta("00:00:00.000001")}')

        # gen filename
        str_formater = '{dev_code}_{t_start:%Y%m%dT%H%M%S}_{t_end:%Y%m%dT%H%M%S}_image_cluster.gz'
        formater_dict = {'dev_code': dataframe.deviceCode.iloc[0],
                         't_start': t_start,
                         't_end': t_end}

        global path
        self.file_name = str_formater.format(**formater_dict)
        self.file_name = os.path.join(path, self.file_name)

        if os.path.exists(self.file_name):
            logger.info(f'Skipp as DB exists: {self.file_name}')
            return

        logger.info(f'Iter range: {t_start:%Y%m%dT%H%M%S}-{t_end:%Y%m%dT%H%M%S}')

        # do cluster search multiprocessing
        run_kwargs = {'dt': dt_iter, 'dataframe': dataframe}
        pbar_kwargs = {'desc': f'{dataframe.deviceCode.iloc[0]} - {t_start:%Y%m%dT%H%M%S}'}
        self.mpi = strawb.MProcessIterator(processes=4, progress_bar=tqdm.notebook.tqdm, with_sys_log=False)
        self.mpi.run(wrap_iter_find_cluster, dr, pbar_kwargs=pbar_kwargs, **run_kwargs)

        logger.info(f'Done iter range: {t_start:%Y%m%dT%H%M%S}-{t_end:%Y%m%dT%H%M%S}')

        # collect result; pop each frame from the iterator so it doesn't keep a second reference
        success_dict = self.mpi.success_dict.copy()
        frames = [self.mpi.result_dict.pop(i) for i in success_dict]

        if not frames:
            # nothing succeeded -> there is no dataframe to save; the file stays absent,
            # therefore the interval isn't skipped the next time
            logger.warning(f'No results, nothing to save: {self.file_name}')
            logger.info(f'self.mpi.error_dict: {self.mpi.error_dict}')
            return

        image_cluster_db = strawb.sync_db_handler.ImageClusterDB(file_name=self.file_name,
                                                                 load_db=False)

        # Combine all frames at once. DataFrame.append was removed with pandas 2.0 and appending
        # in a loop copies the accumulated frame each time. A single frame is taken as it is.
        if len(frames) == 1:
            image_cluster_db.dataframe = frames[0]
        else:
            image_cluster_db.dataframe = pandas.concat(frames, ignore_index=True)
        del frames

        if 'time' in image_cluster_db.dataframe.keys():
            image_cluster_db.dataframe.time = pandas.to_datetime(image_cluster_db.dataframe.time, utc=True)

        logger.info(f'Save DB: {self.file_name}')
        image_cluster_db.save_db()
        logger.info('Done save DB')
        logger.info(f'self.mpi.error_dict: {self.mpi.error_dict}')

        # release the combined dataframe
        image_cluster_db.dataframe = None
        del image_cluster_db

# Start the cluster search for all modules and periods

In [None]:
# All ClusterWrapper instances created by `iter_all`, in order of creation.
cluster_w_list = []


def iter_all(dataframe, dt_month=pandas.offsets.MonthBegin(1)):
    """
    Run the cluster search for every device and every interval found in `dataframe`.

    Only rows with `dataProductCode == 'MSSCD'` and `file_version > 0` are taken into account.
    For each device, one `ClusterWrapper` per step of `dt_month` is created, appended to the
    module level `cluster_w_list` and executed.

    PARAMETER
    ---------
    dataframe: pandas.DataFrame
        files to scan; the columns `dataProductCode`, `file_version`, `deviceCode` and
        `dateFrom` are used
    dt_month: pandas.offsets, optional
        step between two intervals and also the length of each interval
    """
    selected = (dataframe.dataProductCode == 'MSSCD') & (dataframe.file_version > 0)

    for dev_i in dataframe.deviceCode[selected].unique():
        logger.info(f'---- Start {dev_i} ----')

        dataframe_i = dataframe[selected & (dataframe.deviceCode == dev_i)]
        first_date = dataframe_i.dateFrom.min()
        last_date = dataframe_i.dateFrom.max()

        # normalize=True -> start/end go to midnight before the range is generated
        dr_month = pandas.date_range(start=first_date - dt_month,
                                     end=last_date,
                                     freq=dt_month,
                                     normalize=True)

        for dr_i in tqdm.notebook.tqdm(dr_month, desc=dev_i):
            wrapper = ClusterWrapper()
            cluster_w_list.append(wrapper)
            # NOTE(review): the bare print() is presumably needed for the progress bar output
            # when running in a thread - confirm before removing.
            print()
            wrapper.wrap_intervals(dr_i, dt_month, dataframe_i)
            
# Run the scan over the complete DB in a separate thread, so this cell returns immediately.
cluster_thread = threading.Thread(target=iter_all, kwargs={'dataframe': db.dataframe})
cluster_thread.start()

# Monitor the progress

In [None]:
# print state of last item
# The background thread may not have created the first ClusterWrapper yet; in that case
# there is nothing to index and nothing to report.
mpi = cluster_w_list[-1].mpi if cluster_w_list else None

# Get the list of iterations and if the multiprocessing is still active or not
# Only the last item should be active
print('Runs: ', [i.mpi.active for i in cluster_w_list if i.mpi is not None])
print()

# mpi is None if no wrapper exists or if the last wrapper hasn't started its multiprocessing
if mpi is not None:
    print('_active_jobs_dict_: ', mpi._active_jobs_dict_.keys())
    print('_ready_dict_: ', mpi._ready_dict_.keys())
    print('result_dict: ', mpi.result_dict.keys())
    print('error_dict: ', mpi.error_dict.keys())
    print('success_dict: ', mpi.success_dict.keys())