Simple RNN trained on the tokenized dataset, with no token between calls, only at the beginning of a new trace.


In [341]:
# Install/upgrade fastbook only when /content exists; the shell test guards the
# pip call, so on machines without that directory nothing is installed.
# NOTE(review): /content is presumably the Colab working directory — confirm.
! [ -e /content ] && pip install -Uqq fastbook
import fastbook

# Run fastbook's notebook setup before the star-imports in the next cell.
fastbook.setup_book()

In [342]:
import re
import json

from fastbook import *
from fastai.text.all import *

# dtype = torch.float
# device = torch.device("mps")

# Prepare data for model

In [343]:
# ensure proper ordering of the batch
def group_chunks(ds, bs):
    """Reorder `ds` so that each consecutive group of `bs` items is strided.

    With `m = len(ds) // bs`, group number `i` holds the items
    `ds[i], ds[i + m], ds[i + 2*m], ...` (`bs` of them). Items past
    `m * bs` are dropped. Returns a new `L`; `ds` itself is not modified.
    """
    stride = len(ds) // bs
    return L(ds[row + stride*col] for row in range(stride) for col in range(bs))

def get_dls(ds_div, bs, sl, path='../extract/data/model_input_med.txt'):
    """Build the train/valid `DataLoaders` and the vocabulary from a token file.

    The file is read as one string of space-separated tokens, prefixed with an
    `[UNK]` token (which therefore gets index 0 in the vocabulary), and only the
    first `1/ds_div` of its characters is kept.

    Each sample is a pair `(x, y)` of two consecutive, non-overlapping windows of
    `sl` token ids: `x = nums[i:i+sl]`, `y = nums[i+sl:i+2*sl]`. The first 80% of
    the samples form the training set and the rest the validation set; both are
    reordered with `group_chunks` and served unshuffled.

    Args:
        ds_div: divisor applied to the text length (1 keeps everything).
        bs: batch size.
        sl: number of tokens per window.
        path: token file to read; defaults to the medium-size extract.

    Returns:
        `(dls, vocab)`, where `vocab` lists the unique tokens in order of first
        appearance.
    """
    with open(path, 'r') as f:
        text = f.read()

    text = '[UNK] ' + text
    # NOTE(review): the cut is made on characters, so the last token may be a
    # fragment of a real token — confirm whether that matters for the vocab.
    text = text[:int(len(text)/ds_div)]

    tokens = text.strip().split(' ')
    vocab = L(*tokens).unique()

    print(f'text length: {len(text):,d}')
    print(f'tokens length: {len(tokens):,d}')
    print(f'vocab length: {len(vocab):,d}')

    word2idx = {w:i for i,w in enumerate(vocab)}
    nums = L(word2idx[i] for i in tokens)

    # Stop early enough that every target window is a full `sl` tokens long;
    # a shorter final target could not be stacked with the others in a batch.
    seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+sl:i+2*sl]))
             for i in range(0, len(nums) - 2*sl + 1, sl))
    cut = int(len(seqs) * 0.8)
    dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                                 group_chunks(seqs[cut:], bs),
                                 bs=bs, drop_last=True, shuffle=False)
    return dls, vocab

class LMModel(Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear decoder.

    The decoder shares its weight matrix with the embedding, so the embedding
    size and the LSTM hidden size are both `n_hidden`. The LSTM state is kept
    on the module between calls (detached from the graph) and is cleared by
    `reset()`.

    Args:
        vocab_sz: number of tokens in the vocabulary.
        n_hidden: embedding size and LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        p: dropout probability applied to the LSTM output.
    """
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.n_hidden, self.n_layers = n_hidden, n_layers
        # Allocated on the first forward pass, from the actual batch size and
        # device, instead of from a notebook-level `bs` variable.
        self.h = None

    def _new_state(self, batch_size):
        "Fresh zero (h, c) state matching the model's parameters."
        ref = self.i_h.weight
        return [torch.zeros(self.n_layers, batch_size, self.n_hidden,
                            dtype=ref.dtype, device=ref.device)
                for _ in range(2)]

    def forward(self, x):
        """Run one batch of token ids of shape (batch, seq_len).

        Returns `(decoded, raw, out)`: the decoder output, the raw LSTM output
        and the LSTM output after dropout.
        """
        batch_size = x.shape[0]
        stale = (self.h is None
                 or self.h[0].shape[1] != batch_size
                 or self.h[0].device != self.i_h.weight.device)
        # A batch-size or device change invalidates the carried state.
        if stale: self.h = self._new_state(batch_size)
        raw,h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        # Detach so gradients do not flow back through previous batches.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),raw,out

    def reset(self):
        "Forget the carried state; the next forward pass starts from zeros."
        # Dropping the tensors (rather than zeroing them in place) also works
        # when the state was produced under `torch.inference_mode()`.
        self.h = None

# Data model

In [344]:
# Hyperparameters for this run.
emb_sz = 128      # embedding size, also used as the LSTM hidden size
n_layers = 2      # stacked LSTM layers
dropout_p = 0.4   # dropout applied to the LSTM output
wd = 0.1          # weight decay
n_epoch = 2
lr = 1e-2
bs = 8            # batch size
ds_div = 8        # keep 1/ds_div of the dataset
sl = 16           # tokens per input/target window
seed = 42

# Scratch buffers kept for ad-hoc debugging of metrics; nothing in this cell
# reads them.
inputs = []
preds = []
targs = []
cnt = 0

# Seed weight initialisation and dropout so a re-run gives the same numbers.
torch.manual_seed(seed)

dls, vocab = get_dls(ds_div, bs, sl)

model = torch.compile(LMModel(len(vocab), emb_sz, n_layers, dropout_p))
print(f'ds_div: {ds_div}, bs: {bs}, emb_sz: {emb_sz}, n_layers: {n_layers}, sl: {sl}, lr: {lr}')
learn = TextLearner(dls, model,
                    loss_func=CrossEntropyLossFlat(), metrics=[accuracy])
learn.fit_one_cycle(n_epoch, lr, wd=wd)

text length: 47,324,779
tokens length: 1,541,629
vocab length: 17,351
ds_div: 8, bs: 8, emb_sz: 128, n_layers: 2, sl: 16, lr: 0.01


epoch,train_loss,valid_loss,accuracy,my_accuracy,time
0,2.182962,3.600276,0.599596,0.599596,03:53
1,2.300854,1.567296,0.809434,0.809434,03:56


In [70]:
from pathlib import Path

# Output layout, relative to the notebook directory.
evmxo_path = Path('.')
model_path = Path('serve')
vocab_path = Path('index_to_names')
state_dict_path = Path('model-store')

# Make sure every output directory exists before writing into it.
for out_dir in (model_path, vocab_path, state_dict_path):
    (evmxo_path/out_dir).mkdir(parents=True, exist_ok=True)

vocab_len = len(vocab)
model_name = f'evmxo_vocab{vocab_len}_bs{bs}_sl{sl}_emb{emb_sz}'

# 1) Trained weights.
# NOTE(review): `learn.model` is the object returned by `torch.compile` in the
# training cell; its state_dict keys are presumably prefixed with `_orig_mod.`
# — confirm the serving code loads them as saved.
state_dict_filename = f'state_dict_{model_name}.pt'
torch.save(learn.model.state_dict(), evmxo_path/state_dict_path/state_dict_filename)

# 2) Index -> token mapping (JSON turns the integer keys into strings).
vocab_dict = dict(enumerate(vocab))
vocab_filename = f'index_to_name_{model_name}.json'
with open(evmxo_path/vocab_path/vocab_filename, 'w') as f:
    json.dump(vocab_dict, f, indent=2)

# 3) A small Python module that pins this run's sizes onto base_model.LMModel.
#    The layer count is taken from `n_layers` so it cannot drift from training.
model_filename = f'model_{model_name}.py'
with open(evmxo_path/model_path/model_filename, 'w') as f:
    model_code = f'''
import base_model

batch_size = {bs}
embeddings_size = {emb_sz}
vocab_length = {vocab_len}
n_layers = {n_layers}

class LMModel_{model_name}(base_model.LMModel):
    def __init__(self):
        super(LMModel_{model_name}, self).__init__(vocab_length, embeddings_size, n_layers, batch_size)
'''
    f.write(model_code)

print(f'saved model: {model_name}')

saved model: evmxo_vocab17351_bs8_sl16_emb128


In [98]:
# Copy this model's vocabulary file to the fixed name index_to_name.json, which
# is the file listed under --extra-files below.
! cp {evmxo_path}/{vocab_path}/{vocab_filename} {evmxo_path}/{vocab_path}/index_to_name.json
# Package the generated model file, the saved state dict, the handler and the
# extra files with torch-model-archiver; the archive is exported to model-store
# and -f overwrites an existing one.
! torch-model-archiver --model-name {model_name} --model-file {evmxo_path}/{model_path}/{model_filename} \
--serialized-file {evmxo_path}/{state_dict_path}/{state_dict_filename} \
--extra-files {evmxo_path}/{model_path}/evmxHandler.py,{evmxo_path}/{model_path}/base_model.py,{evmxo_path}/{vocab_path}/index_to_name.json \
--handler {evmxo_path}/serve/handler.py -v 1.0 \
--export-path {evmxo_path}/model-store -f



In [None]:
# Training to the max on different batch sizes

# accuracy still slightly increasing after 8 epochs
# ds_div: 4, bs: 64, emb_sz: 128, n_layers: 2, sl: 64
# epoch	train_loss	valid_loss	accuracy	time
# 0	3.281244	3.325129	0.578501	03:14
# 1	1.484431	1.413652	0.765938	03:08
# 2	1.174471	1.098444	0.790480	03:09
# 3	0.992564	0.849274	0.821012	03:10
# 4	0.874084	0.709975	0.842021	03:10
# 5	0.769414	0.605410	0.858727	03:08
# 6	0.700507	0.542126	0.868866	03:08
# 7	0.673132	0.523573	0.870518	03:12

# try 24 epochs
# ds_div: 4, bs: 8, emb_sz: 128, n_layers: 2, sl: 32, lr: 0.001
# epoch	train_loss	valid_loss	accuracy	time
# 0	5.406795	4.948601	0.410613	06:39
# 1	3.266705	3.139425	0.624549	06:40
# 2	2.074166	2.418597	0.669145	06:45
# 3	1.576855	2.224808	0.696416	06:41
# 4	1.498893	2.107253	0.706125	06:37
# 5	1.425497	2.024300	0.718755	06:38
# 6	1.391082	2.117145	0.707255	06:37
# 7	1.368256	1.924885	0.728522	06:38
# 8	1.361894	1.828822	0.744813	06:37
# 9	1.350665	1.753814	0.752652	06:31
# 10	1.339556	1.677444	0.761733	06:34
# 11	1.332376	1.580631	0.771484	06:34
# 12	1.314510	1.491990	0.782072	06:34
# 13	1.298998	1.410088	0.789027	06:42
# 14	1.285932	1.331676	0.795300	06:44
# 15	1.292275	1.289153	0.800699	06:40


# SAVED
# vocab length: 18,260
# ds_div: 2, bs: 64, emb_sz: 128, n_layers: 2, sl: 32, lr: 0.001
# epoch	train_loss	valid_loss	accuracy	time
# 0	4.025664	3.036482	0.657743	06:58
# 1	1.821263	1.476473	0.780719	06:55
# 2	1.225658	0.898250	0.845399	06:58
# 3	0.946340	0.644613	0.879003	06:46
# 4	0.810226	0.555904	0.888096	06:45
# 5	0.740880	0.522233	0.891788	06:41
# 6	0.711271	0.506037	0.893532	06:44
# 7	0.703302	0.500850	0.894477	06:44

# SAVED
# vocab length: 18,260
# ds_div: 2, bs: 8, emb_sz: 128, n_layers: 2, sl: 32, lr: 0.001
# epoch	train_loss	valid_loss	accuracy	time
# 0	4.392361	3.268613	0.601746	13:26
# 1	2.662708	2.112759	0.721673	13:24
# 2	1.782815	1.478820	0.773827	13:22
# 3	1.457928	1.234130	0.800126	13:19
# 4	1.376427	1.260093	0.801919	13:24
# 5	1.315052	1.258130	0.801879	13:46
# 6	1.280592	1.217529	0.805961	13:41
# 7	1.245183	1.205008	0.808077	13:28
# 8	1.228678	1.194641	0.809214	17:04
# 9	1.214260	1.165168	0.812769	13:27
# 10	1.200192	1.159776	0.812537	13:28
# 11	1.194933	1.153038	0.813744	13:24
# 12	1.174281	1.140359	0.815773	13:22
# 13	1.170180	1.129120	0.818285	13:19
# 14	1.159167	1.108682	0.821779	13:20
# 15	1.145710	1.092171	0.823530	13:20
# 16	1.139795	1.081515	0.823574	13:20
# 17	1.126797	1.061210	0.827687	13:20
# 18	1.118184	1.034794	0.831409	13:16
# 19	1.105856	1.008426	0.835146	13:18
# 20	1.088293	0.979371	0.839519	13:19
# 21	1.085581	0.950177	0.843593	13:20
# 22	1.062206	0.916075	0.848545	13:15
# 23	1.049483	0.880122	0.853547	13:14
# 24	1.030618	0.838580	0.859141	13:15
# 25	1.027435	0.811198	0.864295	13:17
# 26	1.010219	0.779236	0.869996	13:18
# 27	0.999372	0.758293	0.872126	13:18
# 28	0.990476	0.740457	0.873269	13:19
# 29	0.985521	0.730486	0.873999	13:20
# 30	0.980521	0.724510	0.874585	3:46:08
# 31	0.980267	0.723266	0.874661	4:36:07

# SAVE xlarge
# vocab length: 19,032
# ds_div: 1, bs: 64, emb_sz: 128, n_layers: 2, sl: 32, lr: 0.001
# epoch	train_loss	valid_loss	accuracy	time
# 0	2.977237	2.647233	0.679372	13:35
# 1	1.058554	0.942487	0.845939	13:42
# 2	0.670738	0.689060	0.872087	13:39
# 3	0.573277	0.571642	0.883391	13:41
# 4	0.524220	0.503763	0.893860	13:40
# 5	0.494350	0.463308	0.901011	13:44
# 6	0.477324	0.442572	0.904965	13:48
# 7	0.471718	0.437447	0.906063	13:37

# Testing

In [24]:
# Token -> index lookup, the inverse of `vocab` (index -> token).
word2idx = {v: k for k, v in enumerate(vocab)}

# Token that `get_dls` prepends to the text, so it is part of the vocabulary.
UNK_TOKEN = '[UNK]'

def words2idx(l):
    """Map tokens to vocabulary indices.

    Tokens missing from the vocabulary map to the index of `[UNK]` when the
    vocabulary contains it; otherwise an unknown token raises `KeyError`.
    """
    unk_idx = word2idx.get(UNK_TOKEN)
    if unk_idx is None:
        return [word2idx[i] for i in l]
    return [word2idx.get(i, unk_idx) for i in l]

def idx2words(l):
    "Map vocabulary indices back to their tokens."
    return [vocab[i] for i in l]

[1387,
 2698,
 2699,
 2700,
 2701,
 2702,
 665,
 669,
 2703,
 451,
 452,
 2704,
 1975,
 1976,
 2705,
 2232]

In [345]:
inp = ["KS:bcb", "KS:bcc", "KP:0x102874f8c031d6ffdd1f56044c9694f8da5a94e8544779489cdbe3c6f1aeb", "KS:c3b", "KS:c3c", "KP:0xab17cf9d32a81474da175e525185d4126797203f1935d379f0e5cbbcfe621", "KS:e76", "KS:e77", "KP:0x717fe4ed56f5623deae4023431b785ee24a24f1cc87afbd6841ada19332f5", "KS:df5", "KS:df6", "KP:0x405db1111f2d2da95e58ea488695497e1059df8ca77b713e262cde10b9e09", "KS:1dd", "KS:1de", "KP:0x7bb4d269e2b923067312d8b5314084b3f8c7504c5d71107c3b4d1874dfb08", "KS:297"]
inp_idx = torch.Tensor(words2idx(inp)).type(torch.int)

t = torch.zeros(7, 16, dtype=int)
t = torch.cat((inp_idx.unsqueeze(dim=0), t))

learn.model.eval()
with torch.inference_mode():
#     print(t)
    %timeit learn.model(t)
#     print(results)

# print('inp: ', json.dumps(inp))
# pred = idx2words(results[0].argmax(dim=-1))
# print('results: ', results[0])
# print('pred: ', json.dumps(pred))
# # target = idx2words(train_pred[2][idx])
# # print('target: ', json.dumps(target))
# # print('accuracy: ', len(set(pred).intersection(target)) / len(pred))


# # model.eval()
# # with torch.inference_mode():
# #     results_loaded, _, _ = model(t)
# # pred_loaded = idx2words(results.argmax(dim=-1)[0])
# # print('pred_loaded: ', json.dumps(pred_loaded))
# # print('accuracy_loaded: ', len(set(pred_loaded).intersection(target)) / len(pred_loaded))

2.1 ms ± 8.84 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
