In [1]:
import pandas as pd
import numpy as np
from eda.tools import seq_to_num

In [83]:
# Load the training set; the first CSV column becomes the DataFrame index.
train_df = pd.read_csv('data/train.csv', index_col=0)
# Convert the `Sequence` column with padding and target splitting disabled.
# NOTE(review): seq_to_num lives in eda.tools; later cells call .tolist() on each
# element, so it presumably yields a Series of numpy arrays - confirm.
X_train = seq_to_num(train_df.Sequence, pad=False, target_split=False)

In [84]:
def rnn_filter(seq):
    """Accept a sequence only if it has more than two terms, all within [0, 2000)."""
    if len(seq) <= 2:
        return False
    return np.all([0 <= term < 2000 for term in seq])


# Drop every sequence the filter rejects.
X_train = X_train[X_train.map(rnn_filter)]

In [85]:
# Number of sequences that survived rnn_filter.
X_train.shape

(27206,)

In [6]:
def prep_data(data, seqlen):
    """
    Generate a supervised training set from raw sequences.

    Every sequence is turned into (input window, target term) pairs:

    * a sequence with at most ``seqlen`` terms yields a single sample: all
      terms but the last, left-padded with zeros up to ``seqlen``, with the
      last term as the target;
    * a longer sequence yields one sample per sliding window of ``seqlen``
      consecutive terms, the target being the term right after the window.

    Empty sequences carry no target term and are skipped.

    @param data: iterable of 1-D integer sequences (numpy arrays or lists)
    @param seqlen: length of every input window
    @return: tuple ``(X, y)``; ``X`` has shape ``(n_samples, seqlen, 1)`` and
             ``y`` has shape ``(n_samples, 1)``. With no samples at all the
             arrays are empty but keep those ranks.
    """
    X, y = [], []
    for seq in data:
        seq = np.asarray(seq)  # lets plain lists through; no-op for arrays
        n_terms = len(seq)
        if n_terms == 0:
            # Nothing to predict from an empty sequence.
            continue
        if n_terms <= seqlen:
            # Too short for a full window: zero-pad on the left so the known
            # terms stay adjacent to the target.
            padding = [0] * (seqlen - (n_terms - 1))
            X.append(padding + [int(term) for term in seq[:-1].tolist()])
            y.append(int(seq[-1]))
            continue
        # Window i covers seq[i : i + seqlen]; its target is seq[i + seqlen].
        X += [seq[i: i + seqlen] for i in range(n_terms - seqlen)]
        y += [int(term) for term in seq[seqlen:].tolist()]
    if not X:
        # np.expand_dims(np.array([]), 2) would raise AxisError, so build the
        # empty result with the expected rank explicitly.
        return np.zeros((0, seqlen, 1), dtype=int), np.zeros((0, 1), dtype=int)
    X = np.expand_dims(np.array(X), 2)
    y = np.expand_dims(np.array(y), 1)
    return X, y

In [7]:
# Window the filtered sequences into 25-term inputs with next-term targets.
X, y = prep_data(X_train, 25)

In [8]:
# (n_samples, seqlen, 1): the trailing axis is added by np.expand_dims in prep_data.
X.shape

(1182807, 25, 1)

In [86]:
def arithmetic_prog(n_terms, a1=1, d=1):
    """
    Return the first ``n_terms`` terms of an arithmetic progression.

    @param n_terms: how many terms to produce
    @param a1: first term
    @param d: common difference added between consecutive terms
    @return: list of terms, starting at ``a1``
    """
    terms = []
    term = a1
    for _ in range(n_terms):
        terms += [term]
        term += d
    return terms

def geometric_prog(n_terms, a1=1, r=2):
    """
    Return the first ``n_terms`` terms of a geometric progression.

    @param n_terms: how many terms to produce
    @param a1: first term
    @param r: common ratio each term is multiplied by to get the next
    @return: list of terms, starting at ``a1``
    """
    def _terms():
        value = a1
        for _ in range(n_terms):
            yield value
            value *= r

    return list(_terms())

In [91]:
# 25 terms of a geometric progression: first term 10, ratio 3.
a = geometric_prog(25, 10, 3)

In [92]:
# Keep only the terms inside [0, 2000), the same value range rnn_filter enforces.
list(filter(lambda x: 0 <= x < 2000, a))

[10, 30, 90, 270, 810]

In [171]:
def aug_dispatch(seqlen, largest_start=10, filt=lambda x: 0 <= x < 2000):
    """
    Draw one synthetic sample from a randomly chosen common progression.

    A start value and a step are drawn from ``[1, largest_start)``, then an
    arithmetic or geometric progression of ``seqlen + 1`` terms is generated.
    Terms rejected by ``filt`` are dropped and the remainder is left-padded
    with zeros back to ``seqlen + 1`` terms.

    @param seqlen: number of input terms in the sample
    @param largest_start: exclusive upper bound for the start value and step
    @param filt: predicate deciding which terms are kept
    @return: tuple ``(inputs, target)``: the first ``seqlen`` terms and the last one
    """
    # The draw order (start, step, kind) is kept so seeded runs give the same samples.
    start = np.random.randint(1, largest_start)
    step = np.random.randint(1, largest_start)
    kind = np.random.randint(0, 2)
    n_terms = seqlen + 1
    if kind == 0:
        terms = arithmetic_prog(n_terms, start, step)
    elif kind == 1:
        # The step doubles as the common ratio here.
        terms = geometric_prog(n_terms, start, step)
    kept = list(filter(filt, terms))
    missing = n_terms - len(kept)
    if missing > 0:
        kept = [0] * missing + kept
    return kept[:-1], kept[-1]

In [177]:
# Spot-check one random synthetic sample: (10 input terms, target term).
aug_dispatch(10)

([5, 11, 17, 23, 29, 35, 41, 47, 53, 59], 65)

In [181]:
def augment_data(X, y, seqlen, aug_frac=.125):
    """
    Extend a training set with synthetic samples drawn from common sequences.

    @param X: input windows of shape ``(n_samples, seqlen, 1)``
    @param y: targets of shape ``(n_samples, 1)``
    @param seqlen: window length of the generated samples
    @param aug_frac: number of synthetic samples as a fraction of ``len(y)``;
                     0 returns ``X`` and ``y`` untouched
    @return: tuple ``(X, y)`` with the synthetic samples appended at the end
    """
    if aug_frac == 0:
        return X, y
    n_extra = int(len(y) * aug_frac)
    # Float buffers, so the concatenated result is promoted to float.
    extra_X = np.zeros((n_extra, seqlen))
    extra_y = np.zeros((n_extra, 1))
    for row in range(n_extra):
        extra_X[row], extra_y[row] = aug_dispatch(seqlen)
    X_out = np.concatenate((X, np.expand_dims(extra_X, 2)), axis=0)
    y_out = np.concatenate((y, extra_y), axis=0)
    return X_out, y_out

In [179]:
# Append synthetic progressions using the default aug_frac (12.5% of the real samples).
X_aug, y_aug = augment_data(X, y, 25)

In [228]:
# Shapes after augmentation; inputs and targets should have the same number of rows.
X_aug.shape, y_aug.shape

((1330657, 25, 1), (1330657, 1))

In [242]:
class RNNData:
    """
    Build RNN training samples (input windows plus next-term targets) from
    integer sequences, optionally topped up with synthetic arithmetic and
    geometric progressions.
    """

    def __init__(self, seqlen, aug_frac=0, minval=0, maxval=2000):
        """
        @param seqlen: length of every input window
        @param aug_frac: number of synthetic samples to add, as a fraction of
                         the real sample count (0 disables augmentation)
        @param minval: smallest allowed term (inclusive)
        @param maxval: largest allowed term (exclusive)
        """
        self.seqlen = seqlen
        self.aug_frac = aug_frac
        # Sequence-level filter: more than two terms, all inside [minval, maxval).
        self.data_filt = lambda seq: len(seq) > 2 and np.all([minval <= x < maxval for x in seq])
        # Term-level filter applied to generated progressions.
        self.val_filt = lambda x: minval <= x < maxval

    def transform(self, data):
        """
        Turn sequences into windowed samples.

        Pass data from seq_to_num without padding. Sequences rejected by
        ``data_filt`` are dropped. A sequence with at most ``seqlen`` terms
        gives one zero-left-padded sample; a longer one gives a sample per
        sliding window, the target being the term right after the window.

        @param data: pandas Series whose elements are 1-D integer sequences
        @return: tuple ``(X, y)`` with shapes ``(n_samples, seqlen, 1)`` and
                 ``(n_samples, 1)``; synthetic samples are appended when
                 ``aug_frac`` is non-zero. If no sequence passes the filter
                 the arrays are empty but keep those ranks.
        """
        # astype(bool) keeps the mask a boolean indexer even when the mapped
        # result comes back with object dtype (e.g. for an empty Series).
        data = data[data.map(self.data_filt).astype(bool)]
        X, y = [], []
        for seq in data:
            seq = np.asarray(seq)
            if len(seq) <= self.seqlen:
                # Too short for a full window: zero-pad on the left.
                padding = [0] * (self.seqlen - (len(seq) - 1))
                X.append(padding + [int(term) for term in seq[:-1].tolist()])
                y.append(int(seq[-1]))
                continue
            # Window i covers seq[i : i + seqlen]; its target is seq[i + seqlen].
            X += [seq[i: i + self.seqlen] for i in range(len(seq) - self.seqlen)]
            y += [int(term) for term in seq[self.seqlen:].tolist()]
        if X:
            X = np.expand_dims(np.array(X), 2)
            y = np.expand_dims(np.array(y), 1)
        else:
            # np.expand_dims(np.array([]), 2) would raise AxisError; build the
            # empty result with the expected rank instead.
            X = np.zeros((0, self.seqlen, 1), dtype=int)
            y = np.zeros((0, 1), dtype=int)
        return (X, y) if self.aug_frac == 0 else self.augment_data(X, y)

    def augment_data(self, X=None, y=None, n_samples=None):
        """
        Fill train dataset with generated samples from various common sequences

        @param X: input windows to extend; required unless ``n_samples`` is given
        @param y: targets to extend; required unless ``n_samples`` is given
        @param n_samples: if not None, only artificial samples are returned
        @return: tuple ``(X, y)``: the inputs followed by the synthetic
                 samples, or only the synthetic samples when ``n_samples`` is set
        """
        if self.aug_frac == 0 and n_samples is None:
            return X, y
        only_aug = n_samples is not None
        if not only_aug:
            n_samples = int(len(y) * self.aug_frac)
        # Float buffers, so appended results are promoted to float.
        aug_X = np.zeros((n_samples, self.seqlen))
        aug_y = np.zeros((n_samples, 1))
        for i in range(n_samples):
            aug_X[i], aug_y[i] = self._aug_dispatch()
        aug_X = np.expand_dims(aug_X, 2)
        if only_aug:
            return aug_X, aug_y
        X = np.append(X, aug_X, axis=0)
        y = np.append(y, aug_y, axis=0)
        return X, y

    def _aug_dispatch(self, largest_start=10):
        """
        Generate one synthetic sample of ``seqlen`` inputs plus a target.

        Relies on the module-level ``arithmetic_prog`` and ``geometric_prog``.
        Terms rejected by ``val_filt`` are dropped and the remainder is
        left-padded with zeros back to ``seqlen + 1`` terms.

        @param largest_start: exclusive upper bound for the random start value and step
        @return: tuple ``(inputs, target)``
        """
        a1 = np.random.randint(1, largest_start)
        d = np.random.randint(1, largest_start)
        choice = np.random.randint(0, 2)
        if choice == 0:
            seq = arithmetic_prog(self.seqlen + 1, a1, d)
        elif choice == 1:
            # d doubles as the common ratio of the geometric progression.
            seq = geometric_prog(self.seqlen + 1, a1, d)
        seq = list(filter(self.val_filt, seq))
        if len(seq) < (self.seqlen + 1):
            seq = [0] * ((self.seqlen + 1) - len(seq)) + seq
        return seq[:-1], seq[-1]

In [243]:
# Same pipeline as prep_data + augment_data above, wrapped in one object:
# 25-term windows plus 12.5% synthetic samples.
rdata = RNNData(seqlen=25, aug_frac=.125)
# NOTE: rebinds X and y, replacing the arrays produced by the earlier prep_data cell.
X, y = rdata.transform(X_train)

In [244]:
# Real plus synthetic samples, each of shape (seqlen, 1).
X.shape

(1330657, 25, 1)

In [245]:
# Passing n_samples returns only synthetic samples (32 here, window length 10),
# even though aug_frac is left at its default of 0.
RNNData(10).augment_data(n_samples=32)

(array([[[  1.],
         [ 10.],
         [ 19.],
         [ 28.],
         [ 37.],
         [ 46.],
         [ 55.],
         [ 64.],
         [ 73.],
         [ 82.]],
 
        [[  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  2.],
         [ 16.],
         [128.]],
 
        [[  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  2.],
         [ 18.],
         [162.]],
 
        [[  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  4.],
         [ 28.],
         [196.]],
 
        [[  3.],
         [  6.],
         [  9.],
         [ 12.],
         [ 15.],
         [ 18.],
         [ 21.],
         [ 24.],
         [ 27.],
         [ 30.]],
 
        [[  9.],
         [ 12.],
         [ 15.],
         [ 18.],
         [ 21.],
         [ 24.],
         [ 27.],
         [ 30.],