In [None]:
# %matplotlib notebook
import tqdm
import tqdm.notebook
import h5py

import os

import dateutil
import datetime
import time

import numpy as np
import strawb

import pandas
import gc
import glob

In [None]:
# Root directory of the image cluster search data.
path = os.path.join(strawb.Config.proc_data_dir, 'image_cluster_search')

# When False, merged DB files that already exist on disk are skipped;
# set to True to regenerate them.
force_update = False

path

# Merge DBs

## Search for monthly DBs

In [None]:
# Collect one record per monthly DB file found under `path`.
# The file name is split at the first three underscores into
# <deviceCode>_<dateFrom>_<dateTo>_<remainder>.
search_str = os.path.join(path, '*_image_cluster.gz')

records = []
for db_file in glob.glob(search_str):
    device_code, date_from, date_to, _ = os.path.basename(db_file).split('_', 3)
    records.append((os.path.abspath(db_file),
                    device_code,
                    date_from,
                    date_to,
                    os.path.getsize(db_file)))

# Explicit column list keeps the column order (and the columns themselves) even
# when no file matched the search pattern.
df = pandas.DataFrame(records,
                      columns=['fullPath', 'deviceCode', 'dateFrom', 'dateTo', 'filesize'])
del records

# NOTE(review): `unit` only applies to numeric epoch values; if the date tokens in
# the file names are formatted strings, confirm that `unit='ns'` is intended here.
df['dateFrom'] = pandas.to_datetime(df['dateFrom'], unit='ns')
df['dateTo'] = pandas.to_datetime(df['dateTo'], unit='ns')

# Order the files chronologically by their start date (original index is kept).
df = df.sort_values('dateFrom')

df

## Compress monthly DBs and generate a merged DB for each device

In [None]:
# Merge the monthly image-cluster DBs of every device into one DB file per device.
# Only clusters with at least `min_n_pixel` pixels are kept, plus every row with
# label <= 0 so that each picture contributes at least one entry.

min_n_pixel = 15
# template of the merged DB file name
str_formater = '{dev_code}_{t_start:%Y%m%dT%H%M%S}_{t_end:%Y%m%dT%H%M%S}_image_cluster_merge_npixel{min_n_pixel}.gz'

for dev_i in df.deviceCode.unique():
    df_i = df[df.deviceCode == dev_i]

    # gen filename: covers the full time range spanned by the device's monthly DBs
    formater_dict = {'dev_code': dev_i,
                     't_start': df_i.dateFrom.min(),
                     't_end': df_i.dateTo.max(),
                     'min_n_pixel': min_n_pixel}
    file_name = str_formater.format(**formater_dict)
    file_name = os.path.join(path, file_name)

    if os.path.exists(file_name) and not force_update:
        print(f'File Exists - skip it - {file_name}')
        continue
    print(file_name)

    db_merg = strawb.sync_db_handler.ImageClusterDB(file_name=file_name, load_db=False)

    # Collect the filtered monthly frames and concatenate them once at the end.
    # `DataFrame.append` was removed in pandas 2.0 and copies the whole
    # accumulated frame on every iteration.
    frames_i = []
    for f_j in tqdm.notebook.tqdm(df_i.fullPath, desc=dev_i):
        db_merg_j = strawb.sync_db_handler.ImageClusterDB(file_name=f_j)
        n_total_j = len(db_merg_j.dataframe)

        # mask by size, but take the label 0 to get at least one entry for every picture
        mask_df = (db_merg_j.dataframe.n_pixel >= min_n_pixel) | (db_merg_j.dataframe.label <= 0)
        dataframe_j = db_merg_j.dataframe[mask_df].copy()

        # fraction of rows kept; guard against an empty monthly DB
        ratio_j = len(dataframe_j) / n_total_j if n_total_j else float('nan')
        print(os.path.basename(f_j), f'{ratio_j:.3f}', n_total_j, len(dataframe_j))

        dataframe_j.time = pandas.to_datetime(dataframe_j.time, utc=True)
        dataframe_j.mean_std_start = pandas.to_datetime(dataframe_j.mean_std_start, utc=True)
        dataframe_j.mean_std_stop = pandas.to_datetime(dataframe_j.mean_std_stop, utc=True)
        # a single multi-key sort is enough; a preceding sort by 'time' alone would be overridden
        dataframe_j.sort_values(['mean_std_start', 'time', 'label'], axis=0, inplace=True, ignore_index=True)

        frames_i.append(dataframe_j)

    # `df_i` holds at least one file because `dev_i` comes from `df.deviceCode.unique()`,
    # so `frames_i` is never empty here.
    db_merg.dataframe = pandas.concat(frames_i, ignore_index=True)
    db_merg.save_db()

    print(strawb.tools.human_size(os.path.getsize(file_name)))