In [8]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import GRU, Dense
from eda.tools import seq_to_num
from preproc.filters import rnn_filter

In [15]:
# Load the training table; the first CSV column is used as the row index.
df_train = pd.read_csv('data/train.csv', index_col=0)
# Convert the Sequence column with the project helper, without padding and
# without splitting off a target term.
# NOTE(review): presumably each entry becomes a numeric array (later cells call
# .tolist() on the entries) - confirm against eda.tools.seq_to_num.
X_train = seq_to_num(df_train.Sequence, pad=False, target_split=False)

In [16]:
def rnn_filter(seq):
    """
    Tell whether a sequence is usable for training the RNN.

    A sequence is kept when it has more than two terms and every term lies in
    [0, 2000), i.e. every term is a valid class id for the 2000-way softmax
    built in this notebook.

    NOTE: this definition shadows the `rnn_filter` imported from
    `preproc.filters` in the first cell.
    """
    # builtin all() on a generator stops at the first out-of-range term
    return len(seq) > 2 and all(0 <= x < 2000 for x in seq)


# Keep only the sequences accepted by the filter.
X_train = X_train[X_train.map(rnn_filter)]

In [17]:
# Window length = 10th percentile of the sequence lengths, truncated to an int.
seqlen = int(X_train.map(len).quantile(.10))

In [18]:
# Display the chosen window length.
seqlen

25

In [19]:
def build_model(input_len, n_units=128, n_classes=2000):
    """
    Build and compile a GRU next-term classifier.

    Parameters
    ----------
    input_len : int
        Number of time steps per input window; each step carries one feature.
    n_units : int, optional
        Size of the GRU layer (default 128).
    n_classes : int, optional
        Width of the softmax output layer (default 2000). With the sparse
        categorical cross-entropy loss, targets are integer class ids and
        must be smaller than this value.

    Returns
    -------
    Sequential
        The model, compiled with the Adam optimizer and an accuracy metric.
    """
    model = Sequential([
        GRU(n_units, input_shape=(input_len, 1)),
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [20]:
# Instantiate the network for input windows of `seqlen` time steps.
m = build_model(seqlen)

In [21]:
def prep_data(data, seqlen):
    """
    Generate a next-term training set from a collection of sequences.

    Every sequence contributes (window, target) samples:

    * a sequence with at most `seqlen` terms yields one sample: all terms but
      the last, left-padded with zeros to `seqlen`, with the last term as target;
    * a longer sequence yields one sample per sliding window of `seqlen`
      consecutive terms, with the term following the window as target;
    * an empty sequence has no target term and is skipped.

    Parameters
    ----------
    data : iterable of array-like
        Integer sequences (numpy arrays or plain lists).
    seqlen : int
        Length of every input window.

    Returns
    -------
    X : np.ndarray, shape (n_samples, seqlen, 1)
    y : np.ndarray, shape (n_samples, 1)
        Both have n_samples == 0 when `data` produces no samples.
    """
    X, y = [], []
    for seq in data:
        # accept plain lists as well as arrays
        seq = np.asarray(seq)
        if len(seq) == 0:
            continue
        if len(seq) <= seqlen:
            head = [int(v) for v in seq[:-1].tolist()]
            X.append([0] * (seqlen - len(head)) + head)
            y.append(int(seq[-1]))
            continue
        X.extend(seq[i: i + seqlen] for i in range(len(seq) - seqlen))
        y.extend(int(v) for v in seq[seqlen:].tolist())
    # an explicit reshape (instead of expand_dims on axis 2) also works when
    # no samples were produced and np.array(X) is 1-D
    X = np.array(X).reshape(len(y), seqlen, 1)
    y = np.array(y).reshape(-1, 1)
    return X, y

In [22]:
# Turn the filtered sequences into (window, next term) training arrays.
X, y = prep_data(X_train, seqlen)

In [23]:
# Inspect the target array shape: (number of samples, 1).
y.shape

(1182807, 1)

In [None]:
# Train for 3 epochs in mini-batches of 32. No validation data is passed, so
# the accuracy in the progress log is measured on the training samples only.
m.fit(X, y, batch_size=32, epochs=3)

Epoch 1/3
 153696/1182807 [==>...........................] - ETA: 17:41 - loss: 3.5075 - acc: 0.1641

In [None]:
# Load the test table; the first CSV column is used as the row index.
test = pd.read_csv('data/test.csv', index_col=0)
# Convert the test sequences with the project helper, padded to the training
# window length.
# NOTE(review): target_split is left at its default here; presumably the helper
# then returns an (inputs, targets) pair - confirm against eda.tools.seq_to_num.
X_test, y_test = seq_to_num(test.Sequence, pad=True, padlen=seqlen)

In [5]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import GRU, Dense


class RNN:
    """
    Inference wrapper around a pre-trained GRU next-term classifier.

    The network is rebuilt in code and its weights are loaded from
    `weights_file`, so `input_len` and `n_classes` must match the
    architecture the weights were saved from.

    NOTE(review): the training cell of this notebook builds a 2000-way softmax
    over windows of `seqlen` steps, while the defaults here are 1000 classes
    and 69 steps - confirm which configuration the weights file belongs to.
    """

    def __init__(self, weights_file='pre_train/rnn_weights.h5', input_len=69, n_classes=1000):
        """
        Parameters
        ----------
        weights_file : str
            Path of the saved weights to load into the rebuilt model.
        input_len : int
            Number of time steps the model expects per sequence.
        n_classes : int, optional
            Width of the softmax output layer (default 1000).
        """
        self.input_len = input_len
        self.n_classes = n_classes
        self.model = self._build_model()
        self.model.load_weights(weights_file)
        # shown by __repr__; the key used to carry a stray ':' ("input_len:")
        self.params = {'input_len': self.input_len, 'model': self.model}

    def _build_model(self):
        """
        Build and compile the GRU(128) -> softmax network.
        """
        model = Sequential()
        model.add(GRU(128, input_shape=(self.input_len, 1)))
        model.add(Dense(self.n_classes, activation='softmax'))
        model.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    def _prep_data(self, data):
        """
        Pad sequences in data and expand the dimension of features.

        Returns
        -------
        data : np.ndarray, shape (n_sequences, input_len, 1), dtype int32
        ind : the index of `data` when it is a pandas Series, otherwise
            range(len(data)).
        """
        # Only a pandas Series carries row labels: ndarrays have no `.index`
        # attribute and a list's `.index` is an unrelated method.
        ind = data.index if isinstance(data, pd.Series) else range(len(data))
        # pad_sequences is called with its default padding/truncating modes
        data = pad_sequences(data, maxlen=self.input_len, dtype='int32')
        data = np.expand_dims(data, 2)
        return data, ind

    def predict(self, data):
        """
        Predict one class id per input sequence.

        Returns
        -------
        tuple
            ([], labels, pred): `labels` is the list of row labels from
            `_prep_data`, `pred` the argmax class id for each sequence.
            The first element is always an empty list.
            NOTE(review): presumably a placeholder required by a shared
            predictor interface - confirm with the callers.
        """
        data, ind = self._prep_data(data)
        pred = self.model.predict(data)
        pred = np.argmax(pred, axis=1)
        if hasattr(ind, 'tolist'):
            return [], ind.tolist(), pred
        else:
            return [], list(ind), pred

    def __repr__(self):
        params = ', '.join([f"{par}={val}" for par, val in self.params.items()])
        return f"{self.__class__.__name__}({params})"
