In [None]:
import os
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [None]:
# Root folder of the washer data.  Defaults to the original OneDrive location
# under the user's home directory; set the WASHER_DATA_DIR environment
# variable to point the notebook at another copy without editing this cell.
datadir = Path(
    os.environ.get('WASHER_DATA_DIR', '~/OneDrive/Documents/data/washer')
).expanduser()
# Sub-folder the capture files are read from (and derived files written to).
rawfolder = datadir.joinpath("Raw Data (CoolTerm)")

In [None]:
# Show the names of the entries in the raw-capture folder.
os.listdir(rawfolder)

In [None]:
# Short alias -> capture file name inside `rawfolder`.
fnames = dict(
    light='1_Light Cycle Capture (1k to 500Ω) 2022-10-11 10-51-33.txt',
    lightdryer='3_Light Cycle+Dryer Capture 2022-10-11 12-40-55.txt',
    dryer='2_Dryer Capture 2022-10-11 11-57-47.txt',
)

In [None]:
# Read every capture file verbatim (binary mode), keyed by its alias.
raw = {
    alias: rawfolder.joinpath(fname).read_bytes()
    for alias, fname in fnames.items()
}


In [None]:
# Size in bytes of each raw capture, by alias.
dict(zip(raw.keys(), map(len, raw.values())))

In [None]:
# Shortcut to the raw bytes of the 'light' capture.
# NOTE(review): `light` is not referenced again in the visible cells.
light = raw['light']

In [None]:
# Interpret each raw capture as UTF-8 text.
decoded = {name: blob.decode('utf-8') for name, blob in raw.items()}

In [None]:
# Parse the hex text of each capture into the byte sequence it spells out.
asbytes = {name: bytearray.fromhex(hextext) for name, hextext in decoded.items()}

In [None]:
# One character per byte, with code point == byte value (a latin-1 decode).
str_ = {name: payload.decode('latin-1') for name, payload in asbytes.items()}

In [None]:
# Number of bytes in the 'light' capture after hex-decoding.
len(asbytes['light'])

In [None]:
# First 20 decoded bytes of the 'light' capture.
asbytes['light'][:20]

In [None]:
# First 60 characters of the 'light' capture's text, before hex-decoding.
decoded['light'][:60]

In [None]:
# Per-capture tally of how often each byte value occurs.
counts = dict(zip(asbytes.keys(), map(Counter, asbytes.values())))

In [None]:
# Byte-value frequency table: one row per byte value, one column per capture.
# A byte value that never occurs in a capture would otherwise appear as NaN
# (and turn the column into floats); treat it as a count of zero instead.
df_counts = pd.DataFrame.from_dict(counts).fillna(0).astype(int)

In [None]:
# Total occurrences of each byte value summed over all captures, most frequent first.
df_counts.sum(axis=1).sort_values(ascending=False)

In [None]:
# Build a one-byte bytes object from the integer 241 (0xf1).
bytes([241])

In [None]:
# A \x escape needs exactly two hex digits (a bare \x is a SyntaxError);
# 0xf1 is the byte 241 that bytes([241]) produces.
b'\xf1'

In [None]:
# In a str literal the same escape yields the single character U+00FF, not a byte.
'\xff'

In [None]:
import sklearn.feature_extraction.text


In [None]:
import sys
# Native byte order of this interpreter's platform ('little' or 'big').
sys.byteorder

In [None]:
# Round-trip the hex pair 'e6': bytes -> int -> hex string; [2:] drops the '0x' prefix.
hex(int.from_bytes(bytes.fromhex('e6'), sys.byteorder))[2:]

In [None]:
# The same round trip done directly with bytes.fromhex() / bytes.hex().
bytes.fromhex('e6').hex()

In [None]:
# Display the bytes.hex method object itself (it is not called here).
bytes.hex

In [None]:
def tokenizer(seq):
    """Convert an iterable of integers (e.g. bytes or bytearray) into hex tokens.

    Every element is rendered with the built-in ``hex()``, so 0 becomes
    ``'0x0'`` and 255 becomes ``'0xff'``.  Returns a list in input order.
    """
    return list(map(hex, seq))

In [None]:
# Bag-of-n-grams over raw byte values (sequences of 1 to 4 bytes).
# The documents fed to this vectorizer are bytearrays, so text preprocessing
# has to stay off:
#   * lowercase=False - the default lowercasing calls .lower() on each
#     document, which would silently rewrite bytes 0x41-0x5A ('A'-'Z') to
#     0x61-0x7A before they are counted.
#   * token_pattern=None - the pattern is unused when a custom tokenizer is
#     supplied; clearing it avoids scikit-learn's "token_pattern will not be
#     used" warning.
# NOTE(review): `encoding` only affects inputs of type `bytes`; confirm it is
# irrelevant for the bytearray documents used here.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    encoding='ascii',
    tokenizer=tokenizer,
    lowercase=False,
    token_pattern=None,
    ngram_range=(1, 4),
)

In [None]:
# Learn the n-gram vocabulary from the captures and count each n-gram per capture.
vectorized = vectorizer.fit_transform(asbytes.values())

In [None]:
# Display the resulting count matrix's repr.
vectorized

In [None]:
# Histogram of total n-gram counts (summed over all captures) on a log-scaled
# y axis.  Uses an explicit figure/axes and labels so the plot stands alone.
fig, ax = plt.subplots()
pd.Series(vectorized.sum(axis=0).A1).hist(ax=ax)
ax.set_yscale('log')
ax.set_xlabel('total occurrences of an n-gram')
ax.set_ylabel('number of n-grams')
ax.set_title('n-gram frequency distribution');

In [None]:
# Flatten the per-column totals to 1-D and show the shape, i.e. the number of n-gram features.
vectorized.sum(axis=0).reshape((-1)).A1.shape

In [None]:
import binascii

In [None]:
# First 10 decoded bytes of the 'light' capture.
asbytes['light'][:10]

In [None]:
# The same 10 bytes rendered back as hex digits with binascii.hexlify.
binascii.hexlify(asbytes['light'][:10])

In [None]:
# Dump each capture's decoded bytes to '<alias>.raw' in the raw-capture
# folder, remembering the paths in `files`.
files = []
for alias, payload in asbytes.items():
    target = rawfolder.joinpath('{}.raw'.format(alias))
    files.append(target)
    target.write_bytes(payload)

In [None]:
# Write each capture's character form (one character per original byte) as a
# UTF-8 text file '<alias>.raw.utf'; the paths are collected in `files2`.
# newline='' disables newline translation: without it, every '\n' (0x0A) in
# the data would be written as '\r\n' on Windows and alter the stream.
files2 = []
for alias, data in str_.items():
    path = rawfolder.joinpath('{}.raw.utf'.format(alias))
    files2.append(path)
    with open(path, 'w', encoding='utf-8', newline='') as f:
        f.write(data)

## Tokenizer approach

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer


In [None]:
# Fresh BPE tokenizer with "[UNK]" as the unknown token.
# NOTE(review): this rebinds `tokenizer`, which earlier named the byte->hex
# function handed to CountVectorizer; re-running the vectorizer-construction
# cell after this one would pass this Tokenizer object instead.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [None]:
# BPE trainer configured with vocab_size=500 and "[UNK]" as its only special token.
trainer = BpeTrainer(vocab_size=500, special_tokens=["[UNK]"])

In [None]:
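# Disabled alternative, kept for reference: train on the original capture files
# rather than on the .raw.utf copies.
# tokenizer.train([str(rawfolder.joinpath(fname)) for fname in fnames.values()], trainer)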

In [None]:
# Train the BPE tokenizer on the .raw.utf files (paths are converted to str first).
tokenizer.train([str(path) for path in files2], trainer)

In [None]:
# Size of the vocabulary after training.
tokenizer.get_vocab_size()

In [None]:
# Display the learned vocabulary.
tokenizer.get_vocab()

In [None]:
# Every vocabulary token as the list of hex code points of its characters.
[list(map(hex, map(ord, token))) for token in tokenizer.get_vocab()]

In [None]:
# Save the trained tokenizer as 'tokenizer.json' inside the raw-capture folder.
tokenizer.save(str(rawfolder.joinpath('tokenizer.json')))

In [None]:
# Encode the character form of the 'light' capture with the trained tokenizer.
xx = tokenizer.encode(str_['light'])

In [None]:
# Length of the encoding of the 'light' capture.
len(xx)

In [None]:
# Display the encoding object's repr.
xx

In [None]:
# First 20 tokens of the encoded 'light' capture.
xx.tokens[:20]

In [None]:
import pandas as pd

In [None]:
# How often each token occurs in the encoded 'light' capture, most frequent first.
pd.Series(xx.tokens).value_counts()