In [1]:
import pandas as pd
import numpy as np
from preproc.filters import markov_filter, rnn_filter
from eda.tools import seq_to_num, acc_score, prep_submit
from models.baseline import Baseline
from models.diff_table import DiffTable
from models.markov_chain import MarkovChain
from models.linear_model import LinearModel
from models.nonlinear_model import NonLinearModel
from models.pipeline import Pipeline
from models.rnn import RNN

Using TensorFlow backend.


In [2]:
# Load both splits, taking the first CSV column as the index.
df_train = pd.read_csv('data/train.csv', index_col=0)
df_test = pd.read_csv('data/test.csv', index_col=0)

In [3]:
# seq_to_num is unpacked into an (X, y) pair for each split; padding is off.
# NOTE(review): presumably y is a target split off from each sequence — confirm
# against eda.tools.seq_to_num.
X_train, y_train = seq_to_num(df_train['Sequence'], pad=False)
X_test, y_test = seq_to_num(df_test['Sequence'], pad=False)

In [14]:
# Stage list handed to Pipeline in the next cell. Each entry is a
# (label, model, filter) triple whose third slot is either None or one of the
# filter callables imported from preproc.filters.
#
# MC and RNN are kept disabled for this training-set run: the recorded output of
# the next cell reports only DT, LRR and NLRR. Every active entry must end with a
# comma; without one after the NLRR tuple, Python parses the following
# parenthesised tuple as a call on it and raises
# "TypeError: 'tuple' object is not callable".
models = [
    ('DT', DiffTable(), None),
    ('LRR', LinearModel(), None),
    ('NLRR', NonLinearModel(), None),
    # ('MC', MarkovChain(), markov_filter),
    # ('RNN', RNN(), rnn_filter),
]

In [15]:
# Build the pipeline from the `models` stage list and run it on the training
# sequences. No fallback model is passed here, and the result of predict() is
# unpacked into two values.
# NOTE(review): `ind` presumably selects the sequences that some stage solved
# (it indexes both y_train and pred in the next cell) — confirm in Pipeline.
pipe = Pipeline(models, verbose=True)
ind, pred = pipe.predict(X_train)

solved by DT: 3086
solved by LRR: 6435
solved by NLRR: 367


In [16]:
# Score only the positions selected by `ind`; the bare expression is the cell's
# displayed output.
# NOTE(review): assumes acc_score takes (targets, predictions) in that order — confirm.
acc_score(y_train[ind], pred[ind])

0.8518406148867314

In [7]:
# Load the Kaggle test sequences (first CSV column as the index) and convert
# them with target_split=False, so seq_to_num's result is bound to one name.
df_validate = pd.read_csv('data/kaggle_test.csv', index_col=0)
X_val = seq_to_num(
    df_validate['Sequence'],
    pad=False,
    target_split=False,
)

In [11]:
# Five-stage list for the run on X_val; rebinding `models` replaces the list
# defined earlier in the notebook. Each entry is a (label, model, filter) triple
# whose third slot is either None or a filter callable from preproc.filters.
models = [
    ('DT', DiffTable(), None),
    ('LRR', LinearModel(), None),
    ('NLRR', NonLinearModel(), None),
    ('MC', MarkovChain(), markov_filter),
    ('RNN', RNN(), rnn_filter)
]

In [12]:
# Rebuild the pipeline with a Baseline fallback and predict on X_val. With a
# fallback supplied, the result of predict() is bound to a single name instead
# of being unpacked into (ind, pred).
# NOTE(review): the recorded output of this cell reports
# "Baseline(function=mode)" and the submission file is named 'submit_mode.csv',
# but the code passes function='last' — confirm which setting is intended
# before re-running.
pipe = Pipeline(models, fallback=Baseline(function='last'), verbose=True)
pred = pipe.predict(X_val)

solved by DT: 4178
solved by LRR: 9443
solved by NLRR: 588
solved by MC: 2610
solved by RNN: 28654
solved by fallback-model Baseline(function=mode): 68372


In [13]:
# Hand the X_val predictions to prep_submit together with the output file name
# (presumably this writes the Kaggle submission file — confirm in eda.tools).
# NOTE(review): the name says "mode", while the pipeline cell above builds its
# fallback with Baseline(function='last') — confirm the name matches the run.
prep_submit(pred, 'submit_mode.csv')