In [1]:
import numpy as np
import pandas as pd
import torch
import itertools
from matplotlib import pyplot as plt
import os

In [2]:

def _dedupe_pairs(Xs, ys):
    """Drop samples whose input row is a duplicate, keeping X and y rows aligned.

    Returns the first occurrence of every distinct row of `Xs` (in the order
    the rows were generated) together with the matching rows of `ys`.
    """
    # An interrupt can land between the two stacking steps and leave one array
    # a batch longer than the other; only rows present in both are usable.
    n = min(len(Xs), len(ys))
    Xs, ys = Xs[:n], ys[:n]
    # De-duplicate on the inputs only and reuse the same row indices for the
    # targets. Calling np.unique on each array separately would sort them
    # independently and destroy the X[i] <-> y[i] correspondence.
    _, first_seen = np.unique(Xs, axis=0, return_index=True)
    first_seen.sort()
    return Xs[first_seen], ys[first_seen]


def generate_data(generator, task_name, path='data', train_size=10_000, val_size=1_000, test_size=2_000, batch_size=32):
    """Draw batches from `generator` and save shuffled train/val/test splits as .npy files.

    Batches are pulled with `next(generator)` until at least
    `train_size + val_size + test_size` samples have been collected. The
    samples are then shuffled (inputs and targets with the same permutation),
    truncated to the requested total and written to
    `{path}/{task_name}_{train|val|test}_{X|y}.npy`.

    Pressing Ctrl-C while collecting stops early: the samples gathered so far
    are de-duplicated and saved, so the later splits may be short or empty.

    Args:
        generator: object supporting `next()`, returning a 4-tuple whose first
            two items are the input batch and the target batch (2-D,
            one row per sample). The last two items are ignored.
        task_name: prefix used in the output file names.
        path: output directory; created if it does not exist.
        train_size: number of samples in the train split.
        val_size: number of samples in the validation split.
        test_size: number of samples in the test split.
        batch_size: unused; kept so existing callers keep working.
    """
    total_size = train_size + val_size + test_size

    Xs, ys, _, _ = next(generator)
    Xs, ys = np.asarray(Xs), np.asarray(ys)

    try:
        while len(Xs) < total_size:
            print(len(Xs))
            X, y, _, _ = next(generator)
            Xs = np.vstack((Xs, X))
            ys = np.vstack((ys, y))
            # Only reachable when a single batch overshoots the target by a
            # wide margin; de-duplicating may bring the count back under
            # total_size, in which case collection simply continues.
            if len(Xs) > total_size * 2:
                print('length achieved')
                Xs, ys = _dedupe_pairs(Xs, ys)
    except KeyboardInterrupt:
        print("Interrupted")
        Xs, ys = _dedupe_pairs(Xs, ys)

    print(Xs.shape, ys.shape)
    if len(Xs) < total_size:
        print(f'Warning: only {len(Xs)} samples collected, {total_size} requested; splits will be short')

    # One permutation applied to both arrays keeps every input with its target.
    order = np.random.permutation(len(Xs))
    Xs = Xs[order][:total_size]
    ys = ys[order][:total_size]

    os.makedirs(path, exist_ok=True)
    val_end = train_size + val_size
    splits = {
        'train': slice(0, train_size),
        'val': slice(train_size, val_end),
        'test': slice(val_end, val_end + test_size),
    }
    for split_name, rows in splits.items():
        np.save(f'{path}/{task_name}_{split_name}_X.npy', Xs[rows])
        np.save(f'{path}/{task_name}_{split_name}_y.npy', ys[rows])

### Copy

In [9]:
class copy_generator:
    """Endless batch source for the synthetic "copy" task.

    Each call to `next()` returns `(X, y, src_mask, tgt_mask)` where

    * `X` is an integer array of shape `(batch_size, seq_len)` with token ids
      drawn uniformly from `[2, num_tokens)` (ids 0 and 1 are never sampled),
    * `y` is an integer array of shape `(batch_size, 2 * seq_len + 1)`: a
      leading token `1` (presumably start-of-sequence) followed by the input
      sequence written twice,
    * `src_mask` / `tgt_mask` are all-True boolean torch tensors with the same
      shapes as `X` / `y`. The same two tensor objects are returned every
      time.

    `num_tokens` must be greater than 2 so that the sampled range is not empty.
    """

    def __init__(self, seq_len, batch_size, num_tokens):
        self.src_mask = torch.ones(batch_size, seq_len).bool()
        self.tgt_mask = torch.ones(batch_size, 2 * seq_len + 1).bool()

        self.enc_seq_len = seq_len
        self.dec_seq_len = 2 * seq_len
        self.batch_size = batch_size
        self.num_tokens = num_tokens

    def __iter__(self):
        """Return self so the object is a proper iterator (usable in `for` loops)."""
        return self

    def __next__(self):
        """Sample one fresh batch; never raises StopIteration."""
        n = self.enc_seq_len
        # One vectorised draw for the whole batch instead of a Python loop per row.
        X = np.random.randint(2, self.num_tokens, (self.batch_size, n), dtype=int)

        y = np.empty((self.batch_size, self.dec_seq_len + 1), dtype=int)
        y[:, 0] = 1
        y[:, 1:n + 1] = X  # first copy
        y[:, n + 1:] = X   # second copy

        return X, y, self.src_mask, self.tgt_mask

In [4]:
# X, y, _, _, = next(gen)

In [11]:
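# NOTE: this cell is disabled (all lines commented out). The saved output below, with shapes
# (130000, 48) and (130000, 97), comes from a run with SEQ_LEN = 48, not the value shown here.
# SEQ_LEN = 24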

# task_name = f'copy{SEQ_LEN}'
# BATCH_SIZE = 10_000
# NUM_TOKENS = 10

# train_size = 100_000
# val_size = 10_000
# test_size = 20_000

# path = f'../synthetic/data{SEQ_LEN}'
# os.system(f'mkdir {path}')

# gen = copy_generator(seq_len=SEQ_LEN, batch_size=BATCH_SIZE, num_tokens=NUM_TOKENS)
# generate_data(gen, task_name=task_name, path=path, train_size=train_size, val_size=val_size, test_size=test_size, batch_size=BATCH_SIZE)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
(130000, 48) (130000, 97)


### Reverse

In [4]:
class reverse_generator:
    """Endless batch source for the synthetic "reverse" task.

    Each call to `next()` returns `(X, y, src_mask, tgt_mask)` where

    * `X` is an integer array of shape `(batch_size, seq_len)` with token ids
      drawn uniformly from `[2, num_tokens)` (ids 0 and 1 are never sampled),
    * `y` is an integer array of shape `(batch_size, seq_len + 1)`: a leading
      token `1` (presumably start-of-sequence) followed by the input sequence
      in reverse order,
    * `src_mask` / `tgt_mask` are all-True boolean torch tensors with the same
      shapes as `X` / `y`. The same two tensor objects are returned every
      time.

    `num_tokens` must be greater than 2 so that the sampled range is not empty.
    """

    def __init__(self, seq_len, batch_size, num_tokens):
        self.src_mask = torch.ones(batch_size, seq_len).bool()
        self.tgt_mask = torch.ones(batch_size, seq_len + 1).bool()

        self.enc_seq_len = seq_len
        self.dec_seq_len = seq_len
        self.batch_size = batch_size
        self.num_tokens = num_tokens

    def __iter__(self):
        """Return self so the object is a proper iterator (usable in `for` loops)."""
        return self

    def __next__(self):
        """Sample one fresh batch; never raises StopIteration."""
        n = self.enc_seq_len
        # One vectorised draw for the whole batch instead of a Python loop per row.
        X = np.random.randint(2, self.num_tokens, (self.batch_size, n), dtype=int)

        y = np.empty((self.batch_size, self.dec_seq_len + 1), dtype=int)
        y[:, 0] = 1
        # The target holds exactly one reversed copy, so it spans columns 1..n.
        y[:, 1:n + 1] = X[:, ::-1]

        return X, y, self.src_mask, self.tgt_mask

In [7]:
# SEQ_LEN = 240

# task_name = f'reverse{SEQ_LEN}'
# BATCH_SIZE = 10000
# NUM_TOKENS = 10

# train_size = 100_000
# val_size = 10_000
# test_size = 20_000

# path = f'../synthetic/data{SEQ_LEN}'
# os.system(f'mkdir {path}')

# gen = reverse_generator(seq_len=SEQ_LEN, batch_size=BATCH_SIZE, num_tokens=NUM_TOKENS)
# generate_data(gen, task_name=task_name, path=path, train_size=train_size, val_size=val_size, test_size=test_size, batch_size=BATCH_SIZE)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
(130000, 240) (130000, 241)
