Functions for generating samples from different series

In [1]:
from random import randint
import numpy as np

def arithmetic_prog(n_terms, a1=1, d=1):
    a = []
    for i in range(n_terms):
        a.append(a1)
        a1 += d
    return a

def geometric_prog(n_terms, a1=1, r=2):
    a = []
    for i in range(n_terms):
        a.append(a1)
        a1 *= r
    return a

In [2]:
def random_sequences(n, terms, largest_start):
    """
    Generate random arithmetic sequences
    """
    X, y = [], []
    for i in range(n):
        a1 = randint(1, largest_start)
        d = randint(1, largest_start)
        seq = arithmetic_prog(terms, a1, d)
        X.append(seq[:-1])
        y.append(seq[-1])
    return X, y


In [3]:
X, y = random_sequences(10, 10, 5)

In [4]:
def to_str(X, y):
    Xstr = []
    for seq in X:
        Xstr.append(','.join([str(_) for _ in seq]))
    ystr = []
    for res in y:
        ystr.append(str(res))
    return Xstr, ystr

In [129]:
X, y = to_str(X, y)

In [15]:
def seq_encode(X, y, alphabet):
    """
    Encode sequence string as a list of indices in some alphabet
    """
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    Xenc = []
    for seq in X:
        seq_enc = [char_to_int[c] for c in seq]
        Xenc.append(seq_enc)
    yenc = []
    for res in y:
        res_enc = [char_to_int[c] for c in res]
        yenc.append(res_enc)
    return Xenc, yenc

In [16]:
alphabet = tuple(" 0123456789,")
X, y = seq_encode(X, y, alphabet)

In [17]:
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, maxlen=20, value=0, dtype=np.int16)
y = pad_sequences(y, maxlen=4, value=0, dtype=np.int16)

In [18]:
def one_hot_encode(X, y, maxint):
    Xenc = []
    for seq in X:
        pattern = []
        for i in seq:
            vec = [0 for _ in range(maxint)]
            vec[i] = 1
            pattern.append(vec)
        Xenc.append(pattern)
    yenc = []
    for seq in y:
        pattern = []
        for i in seq:
            vec = [0 for _ in range(maxint)]
            vec[i] = 1
            pattern.append(vec)
        yenc.append(pattern)
    return Xenc, yenc

In [19]:
X, y = one_hot_encode(X, y, len(alphabet))

In [20]:
def invert(seq, alphabet):
    int_to_char = dict((i, c) for i, c in enumerate(alphabet))
    strings = []
    for pattern in seq:
        s = []
        for sym in pattern:
            s.append(int_to_char[np.argmax(sym)])
        strings.append(''.join(s))
    return strings

In [21]:
invert(y, alphabet)

['  50', '7120']

In [141]:
def prep_data(n_samples, X_maxlen, y_maxlen, alphabet):
    # alphabet should start with padding value
    X, y = random_sequences(n_samples, terms=10, largest_start=10)
    X, y = to_str(X, y)
    X, y = seq_encode(X, y, alphabet)
    X = pad_sequences(X, maxlen=X_maxlen, value=0, dtype=np.int16)
    y = pad_sequences(y, maxlen=y_maxlen, value=0, dtype=np.int16)
    X, y = one_hot_encode(X, y, len(alphabet))
    return np.array(X), np.array(y)

In [164]:
invert(prep_data(1, 30, 3, alphabet)[0], alphabet)

['      5,8,11,14,17,20,23,26,29']

In [172]:
def gen_data(X_maxlen, y_maxlen, alphabet, batch_size=32):
    # alphabet should start with padding value
    while True:
        X, y = random_sequences(batch_size, terms=10, largest_start=10)
        X, y = to_str(X, y)
        X, y = seq_encode(X, y, alphabet)
        X = pad_sequences(X, maxlen=X_maxlen, value=0, dtype=np.int16)
        y = pad_sequences(y, maxlen=y_maxlen, value=0, dtype=np.int16)
        X, y = one_hot_encode(X, y, len(alphabet))
        yield np.array(X), np.array(y)

In [168]:
in_length, out_length = 30, 3
n_chars = len(alphabet)

In [166]:
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, RepeatVector, Dense

In [171]:
model = Sequential()
model.add(LSTM(10, input_shape=(in_length, n_chars)))
model.add(RepeatVector(out_length))
model.add(LSTM(10, return_sequences=True))
model.add(TimeDistributed(Dense(n_chars, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 10)                920       
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 3, 10)             0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 3, 10)             840       
_________________________________________________________________
time_distributed_3 (TimeDist (None, 3, 12)             132       
Total params: 1,892
Trainable params: 1,892
Non-trainable params: 0
_________________________________________________________________


In [174]:
hist = model.fit_generator(gen_data(in_length, out_length, alphabet), epochs=10, steps_per_epoch=1000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [183]:
## Evaluate:
def compare_predictions(model, n_examples):
    X, y = prep_data(n_examples, in_length, out_length, alphabet)
    res = model.predict(X)
    expected = invert(y, alphabet)
    predicted = invert(res, alphabet)
    for i in range(n_examples):
        print(f"Expected={expected[i]}, Predicted={predicted[i]}")

compare_predictions(model, 5)

Expected= 40, Predicted= 40
Expected= 80, Predicted= 80
Expected= 16, Predicted= 16
Expected= 32, Predicted= 32
Expected= 51, Predicted= 52


In [184]:
# now, let's redefine random_sequences function to produce either arithmetic or geometric progression
def random_sequences(n, terms, largest_start):
    """
    Generate random arithmetic sequences
    """
    X, y = [], []
    for i in range(n):
        a1 = randint(1, largest_start)
        d = randint(1, largest_start)
        if randint(a=0, b=1) == 1:
            seq = arithmetic_prog(terms, a1, d)
        else:
            seq = geometric_prog(terms, a1, d)
        X.append(seq[:-1])
        y.append(seq[-1])
    return X, y


In [210]:
# let's also enchance our model architecture
in_length, out_length = 100, 10
n_chars = len(alphabet)

from keras.layers import GRU
model_big = Sequential()
model_big.add(GRU(100, input_shape=(in_length, n_chars)))
model_big.add(RepeatVector(out_length))
model_big.add(GRU(100, return_sequences=True))
model_big.add(GRU(100, return_sequences=True))
model_big.add(TimeDistributed(Dense(n_chars, activation='softmax')))
model_big.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_big.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_4 (GRU)                  (None, 100)               33900     
_________________________________________________________________
repeat_vector_4 (RepeatVecto (None, 10, 100)           0         
_________________________________________________________________
gru_5 (GRU)                  (None, 10, 100)           60300     
_________________________________________________________________
gru_6 (GRU)                  (None, 10, 100)           60300     
_________________________________________________________________
time_distributed_5 (TimeDist (None, 10, 12)            1212      
Total params: 155,712
Trainable params: 155,712
Non-trainable params: 0
_________________________________________________________________


In [212]:
big_hist = model_big.fit_generator(gen_data(in_length, out_length, alphabet), epochs=10, steps_per_epoch=1000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1331c1588>

In [218]:
compare_predictions(model_big, 10)

Expected=        88, Predicted=        88
Expected=7000000000, Predicted=7000000000
Expected=        13, Predicted=        13
Expected= 242121642, Predicted= 242121642
Expected=        65, Predicted=        65
Expected=        23, Predicted=        23
Expected=        93, Predicted=        93
Expected=  10077696, Predicted=  10077696
Expected=        71, Predicted=        71
Expected=    137781, Predicted=    137781


## Using actual data
Now we're talking about. Let's replace our `random_sequences` with sampling from the subset of train dataset

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("../data/train.csv", index_col=0)
test_df = pd.read_csv("../data/test.csv", index_col=0)

In [7]:
df.head()

Unnamed: 0_level_0,Sequence
Id,Unnamed: 1_level_1
222447,"10,11,12,13,14,15,16,17,18,19,1011,21,1112,111..."
126500,"1,0,2,0,0,6,0,0,0,21,0,0,0,3,79,0,0,0,0,41,311..."
217366,"1,2,4,8,15,30,58,114,225,443,871,1715,3375,664..."
4216,"1,2,12,432,31104,6718464,8707129344,2256887925..."
65934,"1,2,0,3,0,0,1,4,1,0,2,0,0,2,0,5,0,2,0,0,0,4,2,..."


In [8]:
def prep_dataset(data):
    """
    Given dataset with Sequence only, split it into X,y
    """
    #df['numbers'] = df.Sequence.str.split(',').map(np.float128)    
    df = data.Sequence.str.rpartition(',').iloc[:, [0, 2]]
    df.rename(columns={0: 'sequence', 2: 'ending'}, inplace=True)
    return df

In [9]:
train_df = prep_dataset(df)
test_df = prep_dataset(test_df)

In [10]:
test_df.head()

Unnamed: 0_level_0,sequence,ending
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
180820,"1,2,3,6,11,14,29,44,64,65,74,92,106,127,153,16...",2617
220915,"1,50798448,190026633752982,1646057381698954570...",52047326332129638504907000521132040
182711,"41,27,23,21,141,63,49,301,43,167,89,521,203,67...",427
40043,"15,20,21,28,35,39,44,48,51,52,55,65,69,85,91,9...",365
144094,"1,1,5,49,653,10201,174965,3188641,60623645,118...",2110916340429978173


In [11]:
def random_sampling(data, n, terms):
    """
    Generate random samples from data
    """
    samples = data.sample(n)
    X = samples.sequence.map(lambda x: x if x.count(',') < terms else x.split(',', (x.count(',') - (terms - 1)))[-1])
    y = samples.ending
    return X, y
X, y = random_sampling(train_df, 2, 10)

In [23]:
alphabet=' 0123456789,-'
n_chars = len(alphabet)
n_terms = 10
in_length = 200
out_length = 20

def prep_sampling(data, n_samples, X_maxlen, y_maxlen, alphabet):
    """
    Sample n samples from data, encode them and add padding
    """
    # alphabet should start with padding value
    X, y = random_sampling(data, n_samples, terms=n_terms)
    X, y = seq_encode(X, y, alphabet)
    X = pad_sequences(X, maxlen=X_maxlen, value=0, dtype=np.int16)
    y = pad_sequences(y, maxlen=y_maxlen, value=0, dtype=np.int16)
    X, y = one_hot_encode(X, y, len(alphabet))
    return np.array(X), np.array(y)

def train_sampling(X_maxlen, y_maxlen, alphabet, batch_size=32):
    # alphabet should start with padding value
    while True:
        X, y = prep_sampling(train_df, batch_size, X_maxlen, y_maxlen, alphabet)
        yield np.array(X), np.array(y)

def val_sampling(X_maxlen, y_maxlen, alphabet, batch_size=32):
    while True:
        X, y = prep_sampling(test_df, batch_size, X_maxlen, y_maxlen, alphabet)
        yield np.array(X), np.array(y)

In [24]:
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed
from keras.models import Sequential

# Building model now
mod = Sequential()
mod.add(LSTM(32, input_shape=(in_length, n_chars)))
mod.add(RepeatVector(out_length))
mod.add(LSTM(32, return_sequences=True))
mod.add(TimeDistributed(Dense(n_chars, activation='softmax')))
mod.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'mae'])
mod.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 32)                5888      
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 20, 32)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 20, 32)            8320      
_________________________________________________________________
time_distributed_2 (TimeDist (None, 20, 13)            429       
Total params: 14,637
Trainable params: 14,637
Non-trainable params: 0
_________________________________________________________________


In [29]:
mod.fit_generator(train_sampling(in_length, out_length, alphabet), epochs=5, steps_per_epoch=250)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x133a22a58>

In [30]:
def compare_predictions(model, n_examples):
    X, y = prep_sampling(train_df, n_examples, in_length, out_length, alphabet)
    res = model.predict(X)
    expected = invert(y, alphabet)
    predicted = invert(res, alphabet)
    for i in range(n_examples):
        print(f"Expected={expected[i]}, Predicted={predicted[i]}")

In [31]:
compare_predictions(mod, 20)

Expected=  135863147487423972, Predicted=  111999111111111110
Expected=                 572, Predicted=                 110
Expected=        801373175814, Predicted=          6777799999
Expected=34406400000000000000, Predicted= 0000000000000000000
Expected=                 147, Predicted=                 111
Expected=                   1, Predicted=                   0
Expected=               14237, Predicted=               11100
Expected=                3066, Predicted=                1110
Expected=          2326692356, Predicted=          1111111100
Expected=                2222, Predicted=                1111
Expected=         11730347948, Predicted=         12559911100
Expected=           182025792, Predicted=          1111111111
Expected=             9426681, Predicted=           100000000
Expected=                 841, Predicted=                 111
Expected=             7258701, Predicted=            11111111
Expected=               12121, Predicted=               11111
Expected

In [32]:
def accuracy_score(model, data):
    """
    Compute accuracy score for the given dataset
    """
    X, y = prep_sampling(data, data.shape[0], in_length, out_length, alphabet)
    res = model.predict(X)
    cnt_correct = (res == y).sum()
    print(f"Accuracy score: {cnt_correct/res.shape[0]}")

In [33]:
accuracy_score(mod, train_df)

Accuracy score: 0.0
