In [2]:
import random
from typing import List

In [3]:
def generate_equations(allowed_operators: List[str], dataset_size: int, min_value: int, max_value: int)->List:
    equations = []
    for _ in range(dataset_size):
        x = random.randint(min_value, max_value)
        y = random.randint(min_value, max_value)
        operator = random.choice(allowed_operators)
        c = str(x) + operator + str(y)
        equations.append((c, eval(c)))
    return equations

In [4]:
allowed_operators = ['+', '-']
dataset_size = 10
min_value, max_value = 0, 100
res = generate_equations(allowed_operators, dataset_size, min_value, max_value)
res

[('40-9', 31),
 ('23-36', -13),
 ('76-15', 61),
 ('36+48', 84),
 ('47+25', 72),
 ('64+82', 146),
 ('49-32', 17),
 ('21+74', 95),
 ('85-90', -5),
 ('25+65', 90)]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
dataset = generate_equations(allowed_operators, 100000, min_value, max_value)
train, test = train_test_split(dataset, test_size=.2, random_state=42)

In [22]:
word2id = {word:idx for idx, word in enumerate('^$#+-1234567890')}
id2word = {idx:word for word, idx in word2id.items()}

In [23]:
word2id

{'#': 2,
 '$': 1,
 '+': 3,
 '-': 4,
 '0': 14,
 '1': 5,
 '2': 6,
 '3': 7,
 '4': 8,
 '5': 9,
 '6': 10,
 '7': 11,
 '8': 12,
 '9': 13,
 '^': 0}

In [24]:
id2word

{0: '^',
 1: '$',
 2: '#',
 3: '+',
 4: '-',
 5: '1',
 6: '2',
 7: '3',
 8: '4',
 9: '5',
 10: '6',
 11: '7',
 12: '8',
 13: '9',
 14: '0'}

In [25]:
start_symbol, end_symbol, padding_symbol = '^', '$', '#'

In [38]:
def sentence_to_id(sentence: str, word2id: dict, padded_len):
    sent_len = min(padded_len - 1, len(sentence)) + 1
    sent_ids = [word2id[w] for w in sentence]
    pad_size = max(0, padded_len - len(sent_ids) - 1)
    sent_ids = sent_ids[:padded_len - 1] + [word2id[end_symbol]] + [word2id[padding_symbol]] * pad_size

    return (sent_ids, sent_len)


In [40]:
def ids_to_sentence(ids: List[int], id2word: dict)->List:
    return [id2word[idx] for idx in ids]

In [43]:
l = [5, 6, 7, 3, 5, 6, 7, 1]
ids_to_sentence(l, id2word)

['1', '2', '3', '+', '1', '2', '3', '$']

In [91]:
def batch_to_ids(sentences, word2id, max_len):
      
    max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)
    batch_ids, batch_ids_len = [], []
    for sentence in sentences:
        ids, ids_len = sentence_to_id(sentence, word2id, max_len_in_batch)
        batch_ids.append(ids)
        batch_ids_len.append(ids_len)
    return batch_ids, batch_ids_len

In [92]:
def generate_batches(samples, batch_size=64):
    X, Y = [], []
    for i, (x, y) in enumerate(samples, 1):
        X.append(x)
        Y.append(y)
        if i % batch_size == 0:
            yield X, Y
            X, Y = [], []
    if X and Y:
        yield X, Y

In [96]:
sentences = train[0][0]
ids, sent_lens = batch_to_ids(sentences, word2id, max_len=10)
print('Input:', sentences)
print('Ids: {}\nSentences lengths: {}'.format(ids, sent_lens))

Input: 39-60
Ids: [[7, 1], [13, 1], [4, 1], [10, 1], [14, 1]]
Sentences lengths: [2, 2, 2, 2, 2]
