In [21]:
from os import path
import importlib

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

<torch._C.Generator at 0x20efff4c390>

In [22]:
import importlib
import test_data as td
from test_data import set_test_data
import tokenizer as to
from tokenizer import Tokenizer
importlib.reload(to)

set_test_data(
    data_count=10000, 
    moving_av=True)

tokenizer = Tokenizer(td.VALUE)
tokenizer.set_quantization_limits() 
shift = 0
window = 120 
time_temp_value = []
while shift + window < len(td.VALUE):
    time_temp_value.extend(tokenizer.get_sentence(td.VALUE[shift: shift + window]))
    shift += window

Test data size (flats are duducted) is 5754
Test data start time is 2023:01:03 21:34
Test data end time is   2023:01:09 21:39
Subtracting moving avarage: True


In [23]:
from core import DATA_STORE

len(time_temp_value)
whole_story = Tokenizer.get_sentence_str(time_temp_value)
with open(
    path.join(DATA_STORE, f'whole_story_{Tokenizer.window}_{Tokenizer.number_pieces}.txt'), 'w') as f:
    for word in whole_story:
        f.write(f'{str(word)}\n')

In [24]:
words = Tokenizer.get_words_from_file()
print('len(words): ', len(words))
print('words[:3]:\n', words[:3])

len(words):  634
words[:3]:
 [('6516', 1736), ('6506', 1617), ('6606', 1412)]


Większość słów ma niewiele powtórzeń: te słowa zastąpię jednym: '0000'.

In [25]:
vocabulary = [_[0] for _ in words if _[1] > words[0][1] * 0.05]

print('len(vocabulary): ', len(vocabulary))
print('vocabulary[:3]:\n', vocabulary[:3])

len(vocabulary):  151
vocabulary[:3]:
 ['6516', '6506', '6606']


Wszystkie dane historyczne przepisuję na słowa mam teraz ciąg zdarzeń który podzielę na części do treningu i do testów.

In [26]:
org_story = Tokenizer.get_story_from_file()
story = [_ if _ in vocabulary else Tokenizer.none_word for _ in org_story]

print('len(story): ', len(story))
print('none-word count: ', story.count(Tokenizer.none_word))
print('story[:3]:\n', story[:3])

len(story):  517
none-word count:  88
story[:3]:
 ['0000', '7414', '7514']


In [27]:
data_len = len(story) 
training_data_raw = story[: (data_len // 3) * 2]
test_data_raw = story[data_len // 3:]

training_data_raw = [(_, 1) for _ in training_data_raw]
print(training_data_raw[:2])

[('0000', 1), ('7414', 1)]


Training data preparation

In [28]:
import torch
import torch.nn as nn
CONTEXT_SIZE = 5  # 2 words to the left, 2 to the right
EMDEDDING_DIM = 10
raw_text = training_data_raw

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
# By deriving a set from `raw_text`, we deduplicate the array 

vocab = set(raw_text)
vocab_size = len(vocab)
word_to_ix = {word:ix for ix, word in enumerate(vocab)}
ix_to_word = {ix:word for ix, word in enumerate(vocab)}

training_data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = []
    for k in range(CONTEXT_SIZE):
        context.append(raw_text[i - (CONTEXT_SIZE - k)])
    for k in range(CONTEXT_SIZE):
        context.append(raw_text[i + 1 + k])
    # context = [raw_text[i - 2], raw_text[i - 1],
    #            raw_text[i + 1], raw_text[i + 2]]
    # [(['4517', '7417', '5617', '6617'], '6617')]
    target = raw_text[i]
    training_data.append((context, target))

print('\nthe first tuples of the training target:\n', training_data[:1])


the first tuples of the training target:
 [([('0000', 1), ('7414', 1), ('7514', 1), ('6514', 1), ('6514', 1), ('6514', 1), ('7514', 1), ('6614', 1), ('6514', 1), ('7414', 1)], ('5614', 1))]


Defining the CBOW Model

In [29]:
class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        # out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        # out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        
    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out
    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)

Defining Parameters and Training

In [30]:
model = CBOW(vocab_size, EMDEDDING_DIM)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

count = 0
for epoch in range(30):
    total_loss = 0
    for context, target in training_data:
        context_vector = make_context_vector(context, word_to_ix)
    log_probs = model(context_vector)
    loss = loss_function(log_probs, torch.tensor([word_to_ix[target]]))

    total_loss += loss
    count += 1
    if count % 15 == 0:
        print(f'{loss:.2e}')
    #optimize at the end of each epoch
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

print('DONE')

2.85e-01
8.12e-02
DONE


Testing

In [32]:
context = ['6501', '6502', '6402', '6501', '0000', '6512', '5413', '7513', '6512', '0000']
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['6501', '6502', '6402', '6501', '0000', '6512', '5413', '7513', '6512', '0000']

Prediction: 6500


In [33]:
count = 0
for epoch in range(300):
    total_loss = 0
    for context, target in data:
        context_vector = make_context_vector(context, word_to_ix)
    log_probs = model(context_vector)
    loss = loss_function(log_probs, torch.tensor([word_to_ix[target]]))

    total_loss += loss
    count += 1
    if count % 15 == 0:
        print(f'{loss:.2e}')
    #optimize at the end of each epoch
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

print('DONE')

6.39e-03
6.07e-03
5.78e-03
5.52e-03
5.28e-03
5.06e-03
4.85e-03
4.67e-03
4.49e-03
4.33e-03
4.18e-03
4.04e-03
3.91e-03
3.78e-03
3.67e-03
3.56e-03
3.45e-03
3.36e-03
3.26e-03
3.18e-03
DONE


In [34]:
context = ['6501', '6502', '6402', '6501', '0000', '6512', '5413', '7513', '6512', '0000']
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['6501', '6502', '6402', '6501', '0000', '6512', '5413', '7513', '6512', '0000']

Prediction: 6500
