In [None]:
# Run this notebook with access to the `strawb.Config.pandas_file_sync_db` file,
# i.e. on an LRZ VM with `dss` mounted,
# or after downloading `/dss/strawb/raw_module_data/pandas_file_sync_db.gz` to your local machine:
# >>>  rsync --progress -au straw-lrz-vm:"/dss/strawb/raw_module_data/pandas_file_sync_db.gz" .
# and adapt 'pandas.read_pickle(strawb.Config.pandas_file_sync_db)' accordingly

import strawb

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

import pandas

import numpy as np
import datetime

# Register a personal template ("myname") and layer it on top of plotly's 'plotly_white'.
pio.templates["myname"] = go.layout.Template(
    layout=go.Layout(
        # default color cycle for the traces
        colorway=px.colors.qualitative.T10,
        # SI prefixes for the y-axis exponent, i.e., 1e9 -> 1G
        yaxis={'exponentformat': 'SI'},
    )
)
# 'plotly_white' is the base; the parameters set in 'myname' overwrite it
pio.templates.default = "plotly_white+myname"

# Load or update the DB

In [None]:
### Check that the configured paths are correct (DB file first, raw data directory second)
print(strawb.Config.pandas_file_sync_db,
      strawb.Config.raw_data_dir,
      sep='\n')

In [None]:
# Load the DB from disk (if it exists), using the path from the config file.
db = strawb.SyncDBHandler(file_name='Default')

# Update the DB:
# - if the DB doesn't exist on disk, load it;
# - else, if the latest entry is older than a day, load the missing days;
# - else, do nothing.
# NOTE(review): this description is carried over from the original comment; the actual
# logic lives in `load_onc_db_update` -- confirm against the strawb implementation.
db.load_onc_db_update(save_db=True)  # and save it on disk too.

# Show the table of files; the commented-out suffix would restrict it to the synced files.
db.dataframe#[db.dataframe.synced]

In [None]:
# Result list for the serial (commented-out) loop at the end of this cell;
# the threaded call in this cell does not write to it.
codes = []

def get_data_product(data_product_code: str):
    """Query the data-product description for one `dataProductCode`.

    Parameters
    ----------
    data_product_code : str
        The code to look up. `None` is accepted and skipped.

    Returns
    -------
    Whatever `db.onc_downloader.getDataProducts` returns for the filter
    `{"dataProductCode": data_product_code}`, or `None` if `data_product_code` is `None`.
    """
    if data_product_code is None:
        return None

    # Use the function argument here. Reading a notebook-level variable instead would fail
    # on a fresh kernel or silently query whatever value that variable holds at call time.
    return db.onc_downloader.getDataProducts({"dataProductCode": data_product_code})
    
# Distinct product codes present in the DB, without the `None` entries.
known_codes = db.dataframe['dataProductCode'].unique()
# `!= None` is evaluated per element here, therefore it is not written as `is not None`
data_product_code_arr = known_codes[known_codes != None]

# Hand the codes to `ShareJobThreads`, which calls `get_data_product` for each of them
# (presumably spread over the 4 threads requested via `thread_n` -- confirm in strawb.tools).
sjt = strawb.tools.ShareJobThreads(thread_n=4, unit='DataProduct')
sjt.do(get_data_product, data_product_code_arr)

# Serial alternative, collecting into `codes`:
# for i in db.dataframe.dataProductCode.unique():
#     if i is not None:
#         print(i)
#         codes.append([db.onc_downloader.getDataProducts({"dataProductCode": i})])

In [None]:
# Scratch check: an empty list compares equal to `list()`, so the expression below is True.
# NOTE(review): `i` becomes a notebook-level name here; no other cell should rely on it.
i = list()
i in (list(), dict())

In [None]:
# Display the `return_buffer` of the `ShareJobThreads` instance
# (presumably the collected return values of the `get_data_product` calls -- confirm in strawb.tools).
sjt.return_buffer

## Extract some performance parameters

In [None]:
# Log files ('LF') are excluded from both the file count and the size sum
mask_lf = db.dataframe['dataProductCode'] != 'LF'
total_size = db.dataframe.loc[mask_lf, 'uncompressedFileSize'].sum()

# Summary: number of selected files and their total uncompressed size
print('STRAWb created since the deployment (01.10.2020)')
print(f'Total number of files: {mask_lf.sum()}')
print(f'Total data size:       {strawb.tools.human_size(total_size)}')

## Add deviceName for cleaner labeling

In [None]:
# New column with a shorter label for the plot legends:
# remove every 'TUM' and turn every '00' into a blank, e.g. 'TUMLIDAR001' -> 'LIDAR 1'
device_code = db.dataframe['deviceCode']
db.dataframe['deviceName'] = device_code.str.replace('TUM', '').str.replace('00', ' ')

## File size by deviceCode

In [None]:
# Select the rows to plot. Alternatives:
#   mask = db.dataframe['fileSize'] > 5e8          # mask by file size
#   mask = db.dataframe['dataProductCode'] != ''   # all data
mask = db.dataframe['dataProductCode'] == 'LIDARTOT'  # only a specific 'dataProductCode'

# File size over time, one color per device
fig_1 = px.scatter(
    db.dataframe.loc[mask],
    x='dateFrom',
    y='fileSize',
    color='deviceName',
    hover_data=['filename', 'dataProductCode'],
    opacity=0.5,
)
fig_1.show()

## File size by dataProductCode for the LIDAR001 and LIDAR002 only

Remove the `deviceCode` mask to show all modules.

In [None]:
# Keep only the files of the two LiDAR modules (LIDAR1 and LIDAR2)
mask = db.dataframe['deviceCode'].isin(['TUMLIDAR001', 'TUMLIDAR002'])

# File size over time, one color per data product
fig_2 = px.scatter(
    db.dataframe.loc[mask],
    x='dateFrom',
    y='fileSize',
    color='dataProductCode',
    hover_data=['filename', 'deviceCode'],
    opacity=0.5,
)
fig_2.show()

## Explore files with HDF5 metadata, a.k.a. HDF5 attributes

In [None]:
# Rows selected by the handler's `get_mask_h5_attrs()` mask
mask = db.get_mask_h5_attrs()

# File size over time, colored by data product; hover also shows the 'measurement_type'
fig_2 = px.scatter(
    db.dataframe.loc[mask],
    x='dateFrom',
    y='fileSize',
    color='dataProductCode',
    hover_data=['filename', 'deviceCode', 'measurement_type'],
    opacity=0.5,
)
fig_2.show()

In [None]:
# Rows selected by the handler's `get_mask_h5_attrs()` mask
mask = db.get_mask_h5_attrs()

# Same selection as before, but colored by device instead of data product
fig_2 = px.scatter(
    db.dataframe.loc[mask],
    x='dateFrom',
    y='fileSize',
    color='deviceCode',
    hover_data=['filename', 'dataProductCode', 'measurement_type'],
    opacity=0.5,
)
fig_2.show()

## Total File Size per Day and Module

In [None]:
# Bin edges: one edge per day, counted from the earliest 'dateFrom'.
# NOTE(review): the edges stop at or before the latest 'dateFrom', so files after the last
# edge fall outside all bins (extending `end` by one day is left disabled, as before).
date_from = db.dataframe['dateFrom']
bins = pandas.date_range(start=date_from.min(),
                         end=date_from.max(),  # + datetime.timedelta(days=1)
                         ).to_numpy('datetime64')

# Log files ('LF') are excluded: they are produced by the ONC DAQ system, not by the module
not_log_file = db.dataframe['dataProductCode'] != 'LF'

# Per module: sum of the uncompressed file sizes in each daily bin
daily_totals = {}
for dev_i in db.dataframe['deviceName'].unique():
    pd_dev_i = db.dataframe[(db.dataframe['deviceName'] == dev_i) & not_log_file]

    hist, bin_edges = np.histogram(pd_dev_i['dateFrom'].to_numpy('datetime64'),
                                   bins=bins,
                                   weights=pd_dev_i['uncompressedFileSize'])
    daily_totals[dev_i] = hist

# One column per module, indexed by the lower edge (day) of each bin
size_per_day = pandas.DataFrame(daily_totals)
size_per_day.index = bins[:-1]

In [None]:
# Scatter plot of the daily totals with a 7-point rolling trendline per module,
# for the trendlines see https://plotly.com/python/linear-fits/
# Alternative trendline: trendline="lowess", trendline_options=dict(frac=0.1)
# Other options left disabled: marginal_y="violin", log_y=True, opacity=.5

# The y-range is taken from the non-zero entries only, padded by 10 % on both ends
nonzero_per_day = size_per_day[size_per_day != 0]
y_low = nonzero_per_day.min().min() * .9
y_high = nonzero_per_day.max().max() * 1.1

fig = px.scatter(
    size_per_day,
    trendline='rolling',
    trendline_options={'window': 7},
    labels={
        "index": "Date",
        "value": "File Size per Day (Byte)",
        "variable": "Module",
    },
    range_y=(y_low, y_high),
)
fig.show()

In [None]:
# Line plot of the daily totals on a logarithmic y-axis, one line per module.
# Lower y-limit is fixed to 49e3; the data-driven alternative would be
# size_per_day[size_per_day!=0].min().min()*.9
# Options left disabled: range_x=[300e3, ], markers=True, line_shape="spline" (or 'linear')
nonzero_sizes = size_per_day[size_per_day != 0]

fig = px.line(
    size_per_day,
    y=size_per_day.columns,
    log_y=True,
    range_y=(49e3, nonzero_sizes.max().max() * 1.1),
    labels={
        "index": "Date",
        "value": "File Size per Day (Byte)",
        "variable": "Module",
    },
    render_mode="svg",
)

# Optional axis styling:
# fig.update_layout({'xaxis': {'linecolor': 'rgb(36,36,36)',
#                              'showline': True,
#                              'ticks': 'outside'},
#                    'yaxis': {'linecolor': 'rgb(36,36,36)',
#                              'showline': True,
#                              'ticks': 'outside'}
#                   })

fig.show()

In [None]:
# Rows without a product code get the string 'None' (this modifies `db.dataframe` in place)
is_missing_code = db.dataframe['dataProductCode'].isnull()
db.dataframe.loc[is_missing_code, 'dataProductCode'] = 'None'

# Only files whose name ends with 'hdf5'
mask = db.dataframe['filename'].str.endswith('hdf5')

# File-size histogram, one color per product code, with a rug plot on the margin
px.histogram(
    db.dataframe.loc[mask],
    x='fileSize',
    color='dataProductCode',
    marginal="rug",
    log_x=True,
    range_x=(1, 1e5),
)

In [None]:
## Check if invalid (empty) files can be detected from the file size - TODO
# mask = db.dataframe.dataProductCode.isnull()
# db.dataframe.loc[mask, 'dataProductCode'] = 'None'  # otherwise it's NaN and it could give problems

# Only files whose name ends with 'hdf5'
mask = db.dataframe.filename.str.endswith('hdf5')
h5_sizes = db.dataframe.uncompressedFileSize[mask]
h5_codes = db.dataframe.dataProductCode[mask].unique()

# Workaround for a histogram with a logarithmic x-axis in plotly:
# bin the sizes with numpy on 1000 geometrically spaced edges and plot the counts as a scatter.
bins = np.geomspace(h5_sizes.min(), h5_sizes.max(), 1000)

# One histogram per product code; empty bins are masked so they are not drawn as zeros.
hist_dict = {}
for code_i in h5_codes:
    # Restrict to the hdf5 files as well: bins and code list are derived from hdf5 files only,
    # so other files sharing the same product code must not be counted.
    mask_i = mask & (db.dataframe.dataProductCode == code_i)
    hist_i = np.histogram(db.dataframe.uncompressedFileSize[mask_i], bins=bins)[0]
    hist_dict[code_i] = np.ma.masked_equal(hist_i, 0)

# Take the bin borders from `bins` directly, so they are defined even if the loop never ran.
hist_dict.update({'bin_low': bins[:-1], 'bin_high': bins[1:]})

px.scatter(data_frame=hist_dict,
           x='bin_low',
           y=h5_codes,
           hover_data=['bin_high'],
           log_x=True)