In [1]:
import numpy as np
import pandas as pd
from tools import seq_to_num, acc_score

In [29]:
# Load the training CSV, using its first column as the index.
df_train = pd.read_csv('../data/kaggle_train.csv', index_col=0)
# Convert the raw Sequence column into (inputs, targets) without padding.
# NOTE(review): later cells call .map on both results, so they are presumably
# pandas Series — confirm against tools.seq_to_num.
X_train, y_train = seq_to_num(df_train.Sequence, pad=False)

In [4]:
# Tally how often each distinct value occurs across all training sequences.
value_counts = {}
for seq in X_train:
    uniques, freqs = np.unique(seq, return_counts=True)
    for val, count in zip(uniques, freqs):
        # Missing keys start from zero, so first and later sightings share one path.
        value_counts[val] = value_counts.get(val, 0) + count

In [5]:
# (value, count) pairs ordered from the most to the least frequent value.
sorted_counts = sorted(value_counts.items(), key=lambda x: x[1], reverse=True)

In [27]:
# Number of distinct values seen in the training sequences.
len(sorted_counts)

416016

In [14]:
# Total number of occurrences of values below 1000.
sum(n for value, n in sorted_counts if value < 1000)

2422707

In [30]:
# Boolean masks: sequences whose every term lies in [0, 1000), and targets in the same range.
X_under = X_train.map(lambda terms: all(0 <= term < 1000 for term in terms))
y_under = y_train.map(lambda target: 0 <= target < 1000)

In [31]:
# Keep only the rows where both the sequence and its target pass the range check.
keep_mask = X_under & y_under
X_under1000 = X_train[keep_mask]
# Targets get a trailing axis, giving shape (n_samples, 1).
y_under1000 = np.expand_dims(y_train[keep_mask], -1)

In [32]:
# Sanity check: the filtered inputs and targets must have the same number of rows.
X_under1000.shape, y_under1000.shape

((35147,), (35147, 1))

In [33]:
# pad sequences
from keras.preprocessing.sequence import pad_sequences

In [34]:
# Median sequence length, used as the fixed number of time steps for padding.
med_length = int(np.median(X_under1000.map(len)))

In [35]:
# Pad/truncate every sequence to med_length steps, then append a feature axis
# so the array has shape (n_samples, med_length, 1).
Xp_under1000 = np.expand_dims(
    pad_sequences(X_under1000, maxlen=med_length, dtype='int32'),
    2,
)

In [36]:
# Expect (n_samples, med_length, 1).
Xp_under1000.shape

(35147, 69, 1)

In [37]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, LSTM, Dropout, TimeDistributed, Dense, Flatten
def build_graph(seqlen, n_classes=1000, units=128):
    """Build an uncompiled single-layer GRU classifier for scalar sequences.

    Parameters
    ----------
    seqlen : int
        Number of time steps per sample; each sample has shape (seqlen, 1).
    n_classes : int, optional
        Width of the softmax output layer, i.e. how many distinct next-term
        values the model can predict. Defaults to 1000, matching the
        [0, 1000) value range used when filtering the data.
    units : int, optional
        Size of the GRU hidden state. Defaults to 128.

    Returns
    -------
    keras.models.Sequential
        The model, not yet compiled.
    """
    model = Sequential()
    # Each time step carries one raw integer value as its only feature.
    model.add(GRU(units, input_shape=(seqlen, 1)))
    # One output unit per candidate next value; softmax yields class probabilities.
    model.add(Dense(n_classes, activation='softmax'))
    return model

In [38]:
# Instantiate the GRU model with the median sequence length as the number of time steps.
m = build_graph(med_length)

In [39]:
# The sparse loss is used because the targets are integer class ids, not one-hot vectors.
m.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])

In [42]:
# Train on the padded in-range sequences; keep the History object for later inspection.
hist = m.fit(Xp_under1000, y_under1000, batch_size=32, epochs=14)

Epoch 1/2
Epoch 2/2


In [44]:
# Persist the trained weights only; the architecture has to be rebuilt
# (e.g. with build_graph) before these can be loaded again.
m.save_weights('../models/rnn_weights.h5')

In [20]:
# Load the held-out split and convert it the same way as the training data.
# NOTE(review): this reads '../data/test.csv' while later cells read
# '../data/kaggle_test.csv' — confirm the two files are meant to differ.
df_test = pd.read_csv('../data/test.csv', index_col=0)
test_X, test_y = seq_to_num(df_test.Sequence, pad=False)

In [196]:
def rnn_prep_data(X, seqlen, y=None, minval=0, maxval=1000, pad=True):
    """Filter sequences to a value range and optionally pad them for the RNN.

    A sequence is kept only when every term satisfies ``minval <= term < maxval``.
    When ``y`` is given, the matching target must satisfy the same bound.

    Parameters
    ----------
    X : pandas.Series
        Series whose elements are integer sequences.
    seqlen : int
        Number of time steps to pad/truncate to. Only used when ``pad`` is true.
    y : pandas.Series, optional
        Targets aligned with ``X``. Changes the second return value (see below).
    minval, maxval : int, optional
        Half-open range ``[minval, maxval)`` that values must fall in.
    pad : bool, optional
        If true (default), return ``X`` as an int32 array of shape
        ``(n_kept, seqlen, 1)``. If false, return the filtered Series unchanged.
        The flag is honoured whether or not ``y`` is supplied.

    Returns
    -------
    (X, y) when ``y`` is given
        ``y`` is an array of shape ``(n_kept, 1)``.
    (X, ind) when ``y`` is None
        ``ind`` is the index of the kept sequences.
    """
    def _in_range(value):
        return minval <= value < maxval

    # Build the row mask once; with targets present both conditions must hold.
    keep = X.map(lambda seq: all(_in_range(term) for term in seq))
    if y is not None:
        keep = keep & y.map(_in_range)

    X = X[keep]
    # Capture the surviving labels before padding turns X into a plain array.
    ind = X.index
    if pad:
        X = pad_sequences(X, maxlen=seqlen, dtype='int32')
        X = np.expand_dims(X, 2)

    if y is None:
        return X, ind
    return X, np.expand_dims(y[keep], -1)

In [23]:
# rnn_prep_data takes (X, seqlen, y=None, ...): the padding length comes second
# and the targets are passed by keyword so they cannot be mistaken for seqlen.
X_test, y_test = rnn_prep_data(test_X, med_length, y=test_y)

In [69]:
# Predict class probabilities and keep the most likely class id for each sequence.
pred = np.argmax(m.predict(X_test), axis=1)

In [70]:
# Score the RNN's predicted class ids against the held-out targets using the project helper.
acc_score(y_test, pred)

0.14337533409698358

In [178]:
def gen_seq(data, seqlen=68, batch_size=32):
    """Yield endless random training batches of (window, next term) pairs.

    Each batch samples ``batch_size`` sequences from ``data`` and turns each
    one into a window of ``seqlen + 1`` consecutive terms: the first
    ``seqlen`` are the model input and the last is the target.

    * Sequences shorter than ``seqlen + 1`` are left-padded with zeros.
    * Longer sequences contribute one randomly positioned window.

    Parameters
    ----------
    data : pandas.Series
        Series whose elements are integer sequences.
    seqlen : int, optional
        Number of input time steps per sample.
    batch_size : int, optional
        Number of sequences sampled per batch.

    Yields
    ------
    (X, y) : tuple of numpy.ndarray
        ``X`` is int32 with shape ``(batch_size, seqlen, 1)``;
        ``y`` is int32 with shape ``(batch_size, 1)``.
    """
    window_len = seqlen + 1  # seqlen inputs plus the one target term
    while True:
        batch = data.sample(batch_size)
        X = np.zeros((batch_size, seqlen), dtype=np.int32)
        y = np.zeros(batch_size, dtype=np.int32)
        # Iterate by position: the window lives in a local variable instead of
        # being written back into the sample by label, so repeated index
        # labels (e.g. after pd.concat) cannot break the lookup.
        for row, seq in enumerate(batch):
            if len(seq) < window_len:
                # Left-pad so the real terms (and the target) sit at the end.
                window = np.hstack(([0] * (window_len - len(seq)), seq))
            else:
                # Exclusive upper bound keeps start + window_len <= len(seq).
                start = np.random.randint(len(seq) - seqlen)
                window = seq[start:start + window_len]
            X[row] = window[:-1]
            y[row] = window[-1]
        yield np.expand_dims(X, 2), np.expand_dims(y, 1)

In [258]:
# Re-read the training CSV (same file as df_train) and convert the sequences
# with target_split=False.
# NOTE(review): presumably this keeps the final term inside each sequence
# instead of splitting it off as a target — confirm in tools.seq_to_num.
whole_data = pd.read_csv('../data/kaggle_train.csv', index_col=0)
whole_seq = seq_to_num(whole_data.Sequence, pad=False, target_split=False)

In [259]:
# Load the Kaggle test sequences and convert them without a target split.
# NOTE(review): pad_maxlen and pad_adaptive are passed together with pad=False;
# presumably they have no effect in that case — confirm in tools.seq_to_num.
test_data = pd.read_csv('../data/kaggle_test.csv', index_col=0)
test_seq = seq_to_num(test_data.Sequence, pad=False, pad_maxlen=40, pad_adaptive=False, target_split=False)

In [263]:
# Stack the train and test sequences into a single pool of sequences.
combined = pd.concat((whole_seq, test_seq))

In [267]:
# Keep only sequences whose values all lie in [0, 1000). With pad=False the
# result stays a Series of variable-length sequences, so the seqlen argument
# (68) is not used by this call.
whole_X, ind = rnn_prep_data(combined, 68, pad=False)

In [268]:
# Number of sequences that survived the range filter.
whole_X.shape

(71151,)

In [271]:
# Filter the test sequences to [0, 1000) and pad them to 40 time steps.
# `ind` holds the index labels of the test sequences that were kept.
test_prep, ind = rnn_prep_data(test_seq, seqlen=40, pad=True)

In [269]:
# Second model with 40 time steps, matching the padded test input and the
# seqlen passed to gen_seq during training.
mod2 = build_graph(40)
mod2.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])

In [270]:
# Train on batches produced by gen_seq: 1000 batches per epoch for 5 epochs.
# NOTE(review): fit_generator is deprecated in recent Keras releases in favour
# of fit — confirm against the installed Keras version.
hist = mod2.fit_generator(gen_seq(whole_X, seqlen=40), steps_per_epoch=1000, epochs=5)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [273]:
# Softmax outputs of mod2 for each filtered, padded test sequence.
predictions = mod2.predict(test_prep)

In [274]:
# Collapse each probability row to its most likely class id. The model is
# trained with the raw next-term value as the class label, so the class id is
# the predicted next term.
predictions = np.argmax(predictions, axis=1)

In [276]:
# Test sequences whose index labels are not in `ind`, i.e. those rejected by
# the range filter and therefore not predicted by the RNN.
data_unprocessed = test_seq[~test_seq.index.isin(ind)]

In [204]:
import sys
sys.path.append('..')

In [205]:
from models.rec_rel import LinRecRel
from models.diff_table import DiffTable
from models.baseline import Baseline
from models.pipeline import Pipeline

In [277]:
# Predict the sequences the RNN could not handle with the project's models.
# NOTE(review): presumably Pipeline tries each named model in order and uses
# `fallback` for sequences none of them can predict — confirm in models.pipeline.
models = [('DT', DiffTable(stoplen=4, maxstep=10)),
          ('LRR', LinRecRel(max_order=3, minlen=20))]
fallback = Baseline()
pipe = Pipeline(models, fallback)
pipe_predicted = pipe.predict(data_unprocessed)

In [278]:
# One prediction slot per test id, initialised to zero.
# NOTE(review): np.zeros makes this a float64 Series — confirm that large
# integer predictions assigned into it later are not rounded.
whole_predict = pd.Series(np.zeros(len(test_data)), index=test_data.index)

In [279]:
# Fill in the RNN predictions for the test ids that passed the range filter.
whole_predict[ind] = predictions

In [280]:
# Fill in the pipeline's predictions at the index labels it returned.
whole_predict[pipe_predicted.index] = pipe_predicted

In [281]:
from tools import prep_submit

In [282]:
# Hand the combined predictions to the project's submission helper.
# NOTE(review): presumably this writes the submission file — confirm in tools.prep_submit.
prep_submit(whole_predict)