# Analyse ADXL345 Datasets:

- Data source: https://studio.edgeimpulse.com/public/36985/latest/acquisition/training
- Scenarios: `idle, left_right, up_down, circle`

In [33]:
import importlib
import os
import sys

# NOTE(review): the default below is an absolute, machine-specific path. Set the
# PREAMBLE_DIR environment variable to point at the folder containing preamble.py
# when running this notebook on another machine.
PREAMBLE_DIR = os.environ.get(
    "PREAMBLE_DIR",
    "C:/Users/pmdsc/IdeaProjects/SirPopper/paper_reviews_collaborations/",
)
# Re-running this cell must not keep appending the same entry to sys.path.
if PREAMBLE_DIR not in sys.path:
    sys.path.append(PREAMBLE_DIR)

# Reload BEFORE the star-import: importlib.reload() does not rebind names that were
# already copied into this namespace by an earlier `from preamble import *`, so the
# from-import has to be re-executed after the reload to pick up edits to preamble.py.
import preamble
preamble = importlib.reload(preamble)
from preamble import *

In [44]:
def compile_impulse_dataset_from_json(glob_path='ADXL_GestureDataset/*.json'):
    """Compile acquisition JSON files into one long-format DataFrame.

    Each file matched by ``glob_path`` must contain a top-level ``payload``
    object with the keys ``interval_ms``, ``sensors`` (a list of dicts with
    ``name`` and ``units``) and ``values`` (one row per sample).

    Parameters
    ----------
    glob_path : str
        Glob pattern selecting the JSON files to load.

    Returns
    -------
    pandas.DataFrame
        One row per sample, columns sorted alphabetically:
        one ``<name>_<units>`` column per sensor, plus
        ``meta_interval_ms`` (the file's ``interval_ms``),
        ``meta_seq`` (0-based index of the file in sorted path order),
        ``meta_seq_num`` (0-based sample index within the file) and
        ``target`` (file name up to its first ``.``).
        An empty DataFrame is returned when no file matches.
    """
    import json
    import os

    frames = []
    # glob order is filesystem-dependent; sort so meta_seq is reproducible.
    for seq, fn in enumerate(sorted(glob.glob(glob_path))):
        with open(fn, encoding='utf-8') as fh:
            payload = json.load(fh)['payload']
        columns = [f"{c['name']}_{c['units']}" for c in payload['sensors']]
        dft = pd.DataFrame(payload['values'], columns=columns)
        dft['meta_seq'] = seq
        dft['meta_interval_ms'] = payload['interval_ms']
        # basename() strips the directory with the platform's own separator(s);
        # splitting on '\\' only worked on Windows.
        dft['target'] = os.path.basename(fn).split('.')[0]
        dft = dft.reset_index().rename(columns={'index': 'meta_seq_num'})
        frames.append(dft.reindex(sorted(dft.columns), axis=1))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing the result inside the loop.
    return pd.concat(frames, ignore_index=True)



## Train Set:

In [55]:
# Compile every training recording into a single frame.
dfr = compile_impulse_dataset_from_json('ADXL_GestureDataset/train/*.json')

# Show the compiled frame first; the numeric summary is the cell's result.
display(dfr)
dfr.describe()

Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_interval_ms,meta_seq,meta_seq_num,target
0,-0.23,-1.25,10.63,16.666667,0,0,circle
1,-0.42,-1.26,10.60,16.666667,0,1,circle
2,-0.51,-1.26,10.53,16.666667,0,2,circle
3,-0.59,-1.23,10.53,16.666667,0,3,circle
4,-0.66,-1.25,10.42,16.666667,0,4,circle
...,...,...,...,...,...,...,...
9493,0.40,-0.02,5.62,16.666667,15,589,up_down
9494,0.64,-0.26,6.10,16.666667,15,590,up_down
9495,1.04,-0.41,6.74,16.666667,15,591,up_down
9496,1.20,-0.91,7.75,16.666667,15,592,up_down


Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_interval_ms,meta_seq,meta_seq_num
count,9498.0,9498.0,9498.0,9498.0,9498.0,9498.0
mean,0.254055,-1.012433,10.525372,16.66667,7.500948,296.312697
std,2.899874,1.542622,2.616199,6.395221e-14,4.609375,171.373897
min,-11.75,-6.97,0.23,16.66667,0.0,0.0
25%,-0.88,-1.82,9.72,16.66667,4.0,148.0
50%,0.06,-0.53,10.67,16.66667,7.5,296.0
75%,1.03,0.09,11.18,16.66667,11.75,445.0
max,12.12,3.72,25.49,16.66667,15.0,593.0


#### Inspect Statistics by Class Type:
- Min / Max is approx. -12 to 12 for `accX`, approx. -7 to 4 for `accY`, and approx. 0 to 25 for `accZ` (train set).
- Means and Std are different among axes (x,y,z) and classes.

> ##### Finding:
> - `Requires numeric scaling` for inference and training.
> - The `same scaling parameters` are needed for both inference and training (e.g. min/max for normalization, or mean/std for standardization).
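> - For this dataset (train set), the overall Min is approx. -12 (`accX`) and the overall Max is approx. 25 (`accZ`); a range of -12 to 12 only covers `accX` and `accY`.
> - Therefore, any inference data (from a new sensor device) must be scaled (calibrated) with the same scaling parameters as the training data.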

- `interval_ms` is constant at approx. 16.67 ms per sample (i.e. a sampling rate of approx. 60 Hz).
- Count `594, 593` time series entries per sequence.
- Approx `594*16.67 = 9900` milliseconds (approx. 10 seconds) per sequence set (data file).

> ##### Finding: 
> Limited length `sliding windows` can be used in several ways. They are defined by:
> - `window_size`: `n=2000/16.67=120` samples (i.e. a 2000 ms window)
> - `stride_length`: (TODO: not yet specified)
> 
> Prediction Data Modeling Options:
> - `as raw batch input` (classifier model: `n=120`) -> Outcome
> - `as statistically summarized batch input` (preprocessed features -> classifier model: `n=features`) -> Outcome
> - `as time series` (time series classifier model: `n=3x120`)  -> Outcome
> - `as noise removed time series` (noise removal -> time series classifier model: `n=3x120`)  -> Outcome

In [61]:
# Summary statistics per gesture class, in order of first appearance in `dfr`.
# NOTE(review): the saved output of this cell has 593-594 rows per class (2375 in
# total), which matches the test set rather than the 9498-row train set loaded
# above -- presumably it was executed while `dfr` held the test data; re-run after
# Restart & Run All to confirm.
summary_cols = ['accX_m/s2', 'accY_m/s2', 'accZ_m/s2', 'meta_seq_num', 'meta_interval_ms']
for label in dfr['target'].unique():
    print(label)
    class_rows = dfr.loc[dfr['target'] == label, summary_cols]
    display(class_rows.describe())

circle


Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_seq_num,meta_interval_ms
count,594.0,594.0,594.0,594.0,594.0
mean,0.982407,-0.825842,10.524949,296.5,16.66667
std,2.364399,1.978397,2.988256,171.617307,3.555708e-15
min,-3.95,-4.67,5.14,0.0,16.66667
25%,-1.0675,-2.6775,8.005,148.25,16.66667
50%,0.895,-0.335,10.315,296.5,16.66667
75%,3.1,0.93,13.5875,444.75,16.66667
max,5.97,2.18,16.38,593.0,16.66667


idle


Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_seq_num,meta_interval_ms
count,593.0,593.0,593.0,593.0,593.0
mean,0.212816,-0.212867,10.718094,296.0,16.66667
std,0.042191,0.019483,0.018091,171.328632,3.555713e-15
min,0.04,-0.31,10.64,0.0,16.66667
25%,0.19,-0.22,10.71,148.0,16.66667
50%,0.21,-0.21,10.72,296.0,16.66667
75%,0.24,-0.2,10.73,444.0,16.66667
max,0.4,-0.13,10.78,592.0,16.66667


left_right


Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_seq_num,meta_interval_ms
count,594.0,594.0,594.0,594.0,594.0
mean,0.523754,-0.648973,10.704983,296.5,16.66667
std,2.456826,0.547261,0.787344,171.617307,3.555708e-15
min,-5.31,-2.61,7.59,0.0,16.66667
25%,-1.1275,-0.9975,10.19,148.25,16.66667
50%,0.365,-0.595,10.66,296.5,16.66667
75%,2.25,-0.2725,11.18,444.75,16.66667
max,8.55,0.81,13.81,593.0,16.66667


up_down


Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_seq_num,meta_interval_ms
count,594.0,594.0,594.0,594.0,594.0
mean,0.662643,-1.13899,10.611566,296.5,16.66667
std,0.901166,2.022458,4.468448,171.617307,3.555708e-15
min,-2.09,-5.32,2.76,0.0,16.66667
25%,0.0525,-2.8475,6.775,148.25,16.66667
50%,0.6,-1.14,10.54,296.5,16.66667
75%,1.1975,0.4375,13.9125,444.75,16.66667
max,4.21,2.86,20.79,593.0,16.66667


## Test Set:

In [78]:
# Compile every test recording into a single frame.
# NOTE: this rebinds `dfr`, replacing the training frame loaded in the Train Set section.
dfr = compile_impulse_dataset_from_json('ADXL_GestureDataset/test/*.json')

# Show the compiled frame first; the numeric summary is the cell's result.
display(dfr)
dfr.describe()

Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_interval_ms,meta_seq,meta_seq_num,target
0,-0.11,0.91,10.12,16.666667,0,0,circle
1,-0.23,0.93,10.38,16.666667,0,1,circle
2,-0.22,0.96,10.52,16.666667,0,2,circle
3,-0.15,1.04,10.39,16.666667,0,3,circle
4,0.02,1.08,10.33,16.666667,0,4,circle
...,...,...,...,...,...,...,...
2370,0.54,-2.73,12.30,16.666667,3,589,up_down
2371,0.11,-2.39,11.80,16.666667,3,590,up_down
2372,-0.45,-2.04,11.23,16.666667,3,591,up_down
2373,-0.72,-1.76,10.95,16.666667,3,592,up_down


Unnamed: 0,accX_m/s2,accY_m/s2,accZ_m/s2,meta_interval_ms,meta_seq,meta_seq_num
count,2375.0,2375.0,2375.0,2375.0,2375.0,2375.0
mean,0.595566,-0.706876,10.639865,16.66667,1.500211,296.375158
std,1.784347,1.47864,2.716452,1.421385e-14,1.118458,171.436987
min,-5.31,-5.32,2.76,16.66667,0.0,0.0
25%,-0.105,-1.275,9.895,16.66667,0.5,148.0
50%,0.24,-0.24,10.71,16.66667,2.0,296.0
75%,1.345,-0.16,11.26,16.66667,2.5,445.0
max,8.55,2.86,20.79,16.66667,3.0,593.0


In [42]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the TRAINING recordings only, and load them here explicitly:
# at this point in the notebook `dfr` holds the test set, so relying on it would
# fit the scaling parameters on test data when the notebook is run top to bottom.
dfr_train = compile_impulse_dataset_from_json(glob_path='ADXL_GestureDataset/train/*.json')

feature_cols = ['accX_m/s2', 'accY_m/s2', 'accZ_m/s2']
X = dfr_train[feature_cols].to_numpy()
y = dfr_train['target'].to_numpy()

# Standardize each axis to zero mean / unit variance; keep `scaler` so the same
# parameters can be re-applied (scaler.transform) to test or inference data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled, y

(array([[-0.16693141, -0.1540101 ,  0.0399946 ],
        [-0.23245495, -0.16049291,  0.02852698],
        [-0.26349241, -0.16049291,  0.0017692 ],
        ...,
        [ 0.27104173,  0.39054602, -1.44697345],
        [ 0.32621945,  0.06640547, -1.06089691],
        [ 0.15378908, -0.38090849, -0.60219212]]),
 array(['circle', 'circle', 'circle', ..., 'up_down', 'up_down', 'up_down'],
       dtype=object))

## Deployment / Model Export:
- Choosing quantized `int8` models means that floating-point values are converted to 8-bit integers in the range -128 to 127, i.e. only `n=256` unique discrete values, compared to the `n=(2^8-1)*2^24-1 = 4,278,190,079` distinct values of high-precision floating point ([ref](https://stackoverflow.com/a/7744178)).
    - An example on this data using a `NN=360>20>10>4` model illustrates the difference in performance (model accuracy, inference time and memory):

<table>
<tr><td>
<img src="attachment:88adef0a-b346-4fa4-ba6c-afd28bf89e7f.png" width="80%">
</td><td>
<img src="attachment:eb32432e-286c-460f-8e18-ea6dddf531bf.png" width="80%">
</td></tr>
<tr>
<td>Exporting as Default C++</td>
<td>Exporting as Arduino Library</td>
</tr>
</table>

![image.png](attachment:2fc6386e-c33e-4bf8-b7d8-9a938c089241.png)
